Actual source code: sbaijfact2.c


  2: /*
  3:     Factorization code for SBAIJ format.
  4: */

  6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  7: #include <../src/mat/impls/baij/seq/baij.h>
  8: #include <petsc/private/kernels/blockinvert.h>

 10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 11: {
 12:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
 13:   IS                isrow=a->row;
 14:   PetscInt          mbs  =a->mbs,*ai=a->i,*aj=a->j;
 15:   PetscErrorCode    ierr;
 16:   const PetscInt    *r;
 17:   PetscInt          nz,*vj,k,idx,k1;
 18:   PetscInt          bs =A->rmap->bs,bs2 = a->bs2;
 19:   const MatScalar   *aa=a->a,*v,*diag;
 20:   PetscScalar       *x,*xk,*xj,*xk_tmp,*t;
 21:   const PetscScalar *b;

 24:   VecGetArrayRead(bb,&b);
 25:   VecGetArray(xx,&x);
 26:   t    = a->solve_work;
 27:   ISGetIndices(isrow,&r);
 28:   PetscMalloc1(bs,&xk_tmp);

 30:   /* solve U^T * D * y = b by forward substitution */
 31:   xk = t;
 32:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 33:     idx = bs*r[k];
 34:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
 35:   }
 36:   for (k=0; k<mbs; k++) {
 37:     v    = aa + bs2*ai[k];
 38:     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
 39:     PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
 40:     nz   = ai[k+1] - ai[k];
 41:     vj   = aj + ai[k];
 42:     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
 43:     while (nz--) {
 44:       /* x(:) += U(k,:)^T*(Dk*xk) */
 45:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
 46:       vj++; xj = t + (*vj)*bs;
 47:       v       += bs2;
 48:     }
 49:     /* xk = inv(Dk)*(Dk*xk) */
 50:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
 51:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
 52:   }

 54:   /* solve U*x = y by back substitution */
 55:   for (k=mbs-1; k>=0; k--) {
 56:     v  = aa + bs2*ai[k];
 57:     xk = t + k*bs;        /* xk */
 58:     nz = ai[k+1] - ai[k];
 59:     vj = aj + ai[k];
 60:     xj = t + (*vj)*bs;
 61:     while (nz--) {
 62:       /* xk += U(k,:)*x(:) */
 63:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
 64:       vj++;
 65:       v += bs2; xj = t + (*vj)*bs;
 66:     }
 67:     idx = bs*r[k];
 68:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
 69:   }

 71:   PetscFree(xk_tmp);
 72:   ISRestoreIndices(isrow,&r);
 73:   VecRestoreArrayRead(bb,&b);
 74:   VecRestoreArray(xx,&x);
 75:   PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
 76:   return(0);
 77: }

 79: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 80: {
 82:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet");
 83: }

 85: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 86: {
 88:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented");
 89: }

 91: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
 92: {
 93:   PetscErrorCode  ierr;
 94:   PetscInt        nz,k;
 95:   const PetscInt  *vj,bs2 = bs*bs;
 96:   const MatScalar *v,*diag;
 97:   PetscScalar     *xk,*xj,*xk_tmp;

100:   PetscMalloc1(bs,&xk_tmp);
101:   for (k=0; k<mbs; k++) {
102:     v    = aa + bs2*ai[k];
103:     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
104:     PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
105:     nz   = ai[k+1] - ai[k];
106:     vj   = aj + ai[k];
107:     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
108:     while (nz--) {
109:       /* x(:) += U(k,:)^T*(Dk*xk) */
110:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
111:       vj++; xj = x + (*vj)*bs;
112:       v       += bs2;
113:     }
114:     /* xk = inv(Dk)*(Dk*xk) */
115:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
116:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
117:   }
118:   PetscFree(xk_tmp);
119:   return(0);
120: }

122: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
123: {
124:   PetscInt        nz,k;
125:   const PetscInt  *vj,bs2 = bs*bs;
126:   const MatScalar *v;
127:   PetscScalar     *xk,*xj;

130:   for (k=mbs-1; k>=0; k--) {
131:     v  = aa + bs2*ai[k];
132:     xk = x + k*bs;        /* xk */
133:     nz = ai[k+1] - ai[k];
134:     vj = aj + ai[k];
135:     xj = x + (*vj)*bs;
136:     while (nz--) {
137:       /* xk += U(k,:)*x(:) */
138:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
139:       vj++;
140:       v += bs2; xj = x + (*vj)*bs;
141:     }
142:   }
143:   return(0);
144: }

146: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
147: {
148:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
149:   PetscErrorCode    ierr;
150:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
151:   PetscInt          bs =A->rmap->bs;
152:   const MatScalar   *aa=a->a;
153:   PetscScalar       *x;
154:   const PetscScalar *b;

157:   VecGetArrayRead(bb,&b);
158:   VecGetArray(xx,&x);

160:   /* solve U^T * D * y = b by forward substitution */
161:   PetscArraycpy(x,b,bs*mbs); /* x <- b */
162:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

164:   /* solve U*x = y by back substitution */
165:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

167:   VecRestoreArrayRead(bb,&b);
168:   VecRestoreArray(xx,&x);
169:   PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);
170:   return(0);
171: }

173: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
174: {
175:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
176:   PetscErrorCode    ierr;
177:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
178:   PetscInt          bs =A->rmap->bs;
179:   const MatScalar   *aa=a->a;
180:   const PetscScalar *b;
181:   PetscScalar       *x;

184:   VecGetArrayRead(bb,&b);
185:   VecGetArray(xx,&x);
186:   PetscArraycpy(x,b,bs*mbs); /* x <- b */
187:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
188:   VecRestoreArrayRead(bb,&b);
189:   VecRestoreArray(xx,&x);
190:   PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);
191:   return(0);
192: }

194: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
195: {
196:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
197:   PetscErrorCode    ierr;
198:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
199:   PetscInt          bs =A->rmap->bs;
200:   const MatScalar   *aa=a->a;
201:   const PetscScalar *b;
202:   PetscScalar       *x;

205:   VecGetArrayRead(bb,&b);
206:   VecGetArray(xx,&x);
207:   PetscArraycpy(x,b,bs*mbs);
208:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
209:   VecRestoreArrayRead(bb,&b);
210:   VecRestoreArray(xx,&x);
211:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
212:   return(0);
213: }

215: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
216: {
217:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
218:   IS                isrow=a->row;
219:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
220:   PetscErrorCode    ierr;
221:   PetscInt          nz,k,idx;
222:   const MatScalar   *aa=a->a,*v,*d;
223:   PetscScalar       *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
224:   const PetscScalar *b;

227:   VecGetArrayRead(bb,&b);
228:   VecGetArray(xx,&x);
229:   t    = a->solve_work;
230:   ISGetIndices(isrow,&r);

232:   /* solve U^T * D * y = b by forward substitution */
233:   tp = t;
234:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
235:     idx   = 7*r[k];
236:     tp[0] = b[idx];
237:     tp[1] = b[idx+1];
238:     tp[2] = b[idx+2];
239:     tp[3] = b[idx+3];
240:     tp[4] = b[idx+4];
241:     tp[5] = b[idx+5];
242:     tp[6] = b[idx+6];
243:     tp   += 7;
244:   }

246:   for (k=0; k<mbs; k++) {
247:     v  = aa + 49*ai[k];
248:     vj = aj + ai[k];
249:     tp = t + k*7;
250:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
251:     nz = ai[k+1] - ai[k];
252:     tp = t + (*vj)*7;
253:     while (nz--) {
254:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
255:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
256:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
257:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
258:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
259:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
260:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
261:       vj++;
262:       tp = t + (*vj)*7;
263:       v += 49;
264:     }

266:     /* xk = inv(Dk)*(Dk*xk) */
267:     d     = aa+k*49;       /* ptr to inv(Dk) */
268:     tp    = t + k*7;
269:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
270:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
271:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
272:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
273:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
274:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
275:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
276:   }

278:   /* solve U*x = y by back substitution */
279:   for (k=mbs-1; k>=0; k--) {
280:     v  = aa + 49*ai[k];
281:     vj = aj + ai[k];
282:     tp = t + k*7;
283:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
284:     nz = ai[k+1] - ai[k];

286:     tp = t + (*vj)*7;
287:     while (nz--) {
288:       /* xk += U(k,:)*x(:) */
289:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
290:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
291:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
292:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
293:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
294:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
295:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
296:       vj++;
297:       tp = t + (*vj)*7;
298:       v += 49;
299:     }
300:     tp       = t + k*7;
301:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
302:     idx      = 7*r[k];
303:     x[idx]   = x0;
304:     x[idx+1] = x1;
305:     x[idx+2] = x2;
306:     x[idx+3] = x3;
307:     x[idx+4] = x4;
308:     x[idx+5] = x5;
309:     x[idx+6] = x6;
310:   }

312:   ISRestoreIndices(isrow,&r);
313:   VecRestoreArrayRead(bb,&b);
314:   VecRestoreArray(xx,&x);
315:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
316:   return(0);
317: }

319: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
320: {
321:   const MatScalar *v,*d;
322:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
323:   PetscInt        nz,k;
324:   const PetscInt  *vj;

327:   for (k=0; k<mbs; k++) {
328:     v  = aa + 49*ai[k];
329:     xp = x + k*7;
330:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
331:     nz = ai[k+1] - ai[k];
332:     vj = aj + ai[k];
333:     xp = x + (*vj)*7;
334:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
335:     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
336:     while (nz--) {
337:       /* x(:) += U(k,:)^T*(Dk*xk) */
338:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
339:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
340:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
341:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
342:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
343:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
344:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
345:       vj++;
346:       xp = x + (*vj)*7;
347:       v += 49;
348:     }
349:     /* xk = inv(Dk)*(Dk*xk) */
350:     d     = aa+k*49;       /* ptr to inv(Dk) */
351:     xp    = x + k*7;
352:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
353:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
354:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
355:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
356:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
357:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
358:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
359:   }
360:   return(0);
361: }

363: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
364: {
365:   const MatScalar *v;
366:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
367:   PetscInt        nz,k;
368:   const PetscInt  *vj;

371:   for (k=mbs-1; k>=0; k--) {
372:     v  = aa + 49*ai[k];
373:     xp = x + k*7;
374:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
375:     nz = ai[k+1] - ai[k];
376:     vj = aj + ai[k];
377:     xp = x + (*vj)*7;
378:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
379:     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
380:     while (nz--) {
381:       /* xk += U(k,:)*x(:) */
382:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
383:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
384:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
385:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
386:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
387:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
388:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
389:       vj++;
390:       v += 49; xp = x + (*vj)*7;
391:     }
392:     xp   = x + k*7;
393:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
394:   }
395:   return(0);
396: }

398: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
399: {
400:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
401:   PetscErrorCode    ierr;
402:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
403:   const MatScalar   *aa=a->a;
404:   PetscScalar       *x;
405:   const PetscScalar *b;

408:   VecGetArrayRead(bb,&b);
409:   VecGetArray(xx,&x);

411:   /* solve U^T * D * y = b by forward substitution */
412:   PetscArraycpy(x,b,7*mbs); /* x <- b */
413:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

415:   /* solve U*x = y by back substitution */
416:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

418:   VecRestoreArrayRead(bb,&b);
419:   VecRestoreArray(xx,&x);
420:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
421:   return(0);
422: }

424: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
425: {
426:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
427:   PetscErrorCode    ierr;
428:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
429:   const MatScalar   *aa=a->a;
430:   PetscScalar       *x;
431:   const PetscScalar *b;

434:   VecGetArrayRead(bb,&b);
435:   VecGetArray(xx,&x);
436:   PetscArraycpy(x,b,7*mbs);
437:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
438:   VecRestoreArrayRead(bb,&b);
439:   VecRestoreArray(xx,&x);
440:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
441:   return(0);
442: }

444: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
445: {
446:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
447:   PetscErrorCode    ierr;
448:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
449:   const MatScalar   *aa=a->a;
450:   PetscScalar       *x;
451:   const PetscScalar *b;

454:   VecGetArrayRead(bb,&b);
455:   VecGetArray(xx,&x);
456:   PetscArraycpy(x,b,7*mbs);
457:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
458:   VecRestoreArrayRead(bb,&b);
459:   VecRestoreArray(xx,&x);
460:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
461:   return(0);
462: }

464: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
465: {
466:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
467:   IS                isrow=a->row;
468:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
469:   PetscErrorCode    ierr;
470:   PetscInt          nz,k,idx;
471:   const MatScalar   *aa=a->a,*v,*d;
472:   PetscScalar       *x,x0,x1,x2,x3,x4,x5,*t,*tp;
473:   const PetscScalar *b;

476:   VecGetArrayRead(bb,&b);
477:   VecGetArray(xx,&x);
478:   t    = a->solve_work;
479:   ISGetIndices(isrow,&r);

481:   /* solve U^T * D * y = b by forward substitution */
482:   tp = t;
483:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
484:     idx   = 6*r[k];
485:     tp[0] = b[idx];
486:     tp[1] = b[idx+1];
487:     tp[2] = b[idx+2];
488:     tp[3] = b[idx+3];
489:     tp[4] = b[idx+4];
490:     tp[5] = b[idx+5];
491:     tp   += 6;
492:   }

494:   for (k=0; k<mbs; k++) {
495:     v  = aa + 36*ai[k];
496:     vj = aj + ai[k];
497:     tp = t + k*6;
498:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
499:     nz = ai[k+1] - ai[k];
500:     tp = t + (*vj)*6;
501:     while (nz--) {
502:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
503:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
504:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
505:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
506:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
507:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
508:       vj++;
509:       tp = t + (*vj)*6;
510:       v += 36;
511:     }

513:     /* xk = inv(Dk)*(Dk*xk) */
514:     d     = aa+k*36;       /* ptr to inv(Dk) */
515:     tp    = t + k*6;
516:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
517:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
518:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
519:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
520:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
521:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
522:   }

524:   /* solve U*x = y by back substitution */
525:   for (k=mbs-1; k>=0; k--) {
526:     v  = aa + 36*ai[k];
527:     vj = aj + ai[k];
528:     tp = t + k*6;
529:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
530:     nz = ai[k+1] - ai[k];

532:     tp = t + (*vj)*6;
533:     while (nz--) {
534:       /* xk += U(k,:)*x(:) */
535:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
536:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
537:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
538:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
539:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
540:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
541:       vj++;
542:       tp = t + (*vj)*6;
543:       v += 36;
544:     }
545:     tp       = t + k*6;
546:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
547:     idx      = 6*r[k];
548:     x[idx]   = x0;
549:     x[idx+1] = x1;
550:     x[idx+2] = x2;
551:     x[idx+3] = x3;
552:     x[idx+4] = x4;
553:     x[idx+5] = x5;
554:   }

556:   ISRestoreIndices(isrow,&r);
557:   VecRestoreArrayRead(bb,&b);
558:   VecRestoreArray(xx,&x);
559:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
560:   return(0);
561: }

563: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
564: {
565:   const MatScalar *v,*d;
566:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5;
567:   PetscInt        nz,k;
568:   const PetscInt  *vj;

571:   for (k=0; k<mbs; k++) {
572:     v  = aa + 36*ai[k];
573:     xp = x + k*6;
574:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
575:     nz = ai[k+1] - ai[k];
576:     vj = aj + ai[k];
577:     xp = x + (*vj)*6;
578:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
579:     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580:     while (nz--) {
581:       /* x(:) += U(k,:)^T*(Dk*xk) */
582:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
583:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
584:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
585:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
586:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
587:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
588:       vj++;
589:       xp = x + (*vj)*6;
590:       v += 36;
591:     }
592:     /* xk = inv(Dk)*(Dk*xk) */
593:     d     = aa+k*36;       /* ptr to inv(Dk) */
594:     xp    = x + k*6;
595:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
596:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
597:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
598:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
599:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
600:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
601:   }
602:   return(0);
603: }
604: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
605: {
606:   const MatScalar   *v;
607:   PetscScalar       *xp,x0,x1,x2,x3,x4,x5;
608:   PetscInt          nz,k;
609:   const PetscInt    *vj;

612:   for (k=mbs-1; k>=0; k--) {
613:     v  = aa + 36*ai[k];
614:     xp = x + k*6;
615:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
616:     nz = ai[k+1] - ai[k];
617:     vj = aj + ai[k];
618:     xp = x + (*vj)*6;
619:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
620:     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
621:     while (nz--) {
622:       /* xk += U(k,:)*x(:) */
623:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
624:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
625:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
626:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
627:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
628:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
629:       vj++;
630:       v += 36; xp = x + (*vj)*6;
631:     }
632:     xp   = x + k*6;
633:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
634:   }
635:   return(0);
636: }

638: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
639: {
640:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
641:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
642:   const MatScalar   *aa=a->a;
643:   PetscScalar       *x;
644:   const PetscScalar *b;
645:   PetscErrorCode    ierr;

648:   VecGetArrayRead(bb,&b);
649:   VecGetArray(xx,&x);

651:   /* solve U^T * D * y = b by forward substitution */
652:   PetscArraycpy(x,b,6*mbs); /* x <- b */
653:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

655:   /* solve U*x = y by back substitution */
656:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

658:   VecRestoreArrayRead(bb,&b);
659:   VecRestoreArray(xx,&x);
660:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
661:   return(0);
662: }

664: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
665: {
666:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
667:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
668:   const MatScalar   *aa=a->a;
669:   PetscScalar       *x;
670:   const PetscScalar *b;
671:   PetscErrorCode    ierr;

674:   VecGetArrayRead(bb,&b);
675:   VecGetArray(xx,&x);
676:   PetscArraycpy(x,b,6*mbs); /* x <- b */
677:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
678:   VecRestoreArrayRead(bb,&b);
679:   VecRestoreArray(xx,&x);
680:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
681:   return(0);
682: }

684: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
685: {
686:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
687:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
688:   const MatScalar   *aa=a->a;
689:   PetscScalar       *x;
690:   const PetscScalar *b;
691:   PetscErrorCode    ierr;

694:   VecGetArrayRead(bb,&b);
695:   VecGetArray(xx,&x);
696:   PetscArraycpy(x,b,6*mbs); /* x <- b */
697:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
698:   VecRestoreArrayRead(bb,&b);
699:   VecRestoreArray(xx,&x);
700:   PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
701:   return(0);
702: }

704: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
705: {
706:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
707:   IS                isrow=a->row;
708:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
709:   PetscErrorCode    ierr;
710:   const PetscInt    *r,*vj;
711:   PetscInt          nz,k,idx;
712:   const MatScalar   *aa=a->a,*v,*diag;
713:   PetscScalar       *x,x0,x1,x2,x3,x4,*t,*tp;
714:   const PetscScalar *b;

717:   VecGetArrayRead(bb,&b);
718:   VecGetArray(xx,&x);
719:   t    = a->solve_work;
720:   ISGetIndices(isrow,&r);

722:   /* solve U^T * D * y = b by forward substitution */
723:   tp = t;
724:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
725:     idx   = 5*r[k];
726:     tp[0] = b[idx];
727:     tp[1] = b[idx+1];
728:     tp[2] = b[idx+2];
729:     tp[3] = b[idx+3];
730:     tp[4] = b[idx+4];
731:     tp   += 5;
732:   }

734:   for (k=0; k<mbs; k++) {
735:     v  = aa + 25*ai[k];
736:     vj = aj + ai[k];
737:     tp = t + k*5;
738:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
739:     nz = ai[k+1] - ai[k];

741:     tp = t + (*vj)*5;
742:     while (nz--) {
743:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
744:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
745:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
746:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
747:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
748:       vj++;
749:       tp = t + (*vj)*5;
750:       v += 25;
751:     }

753:     /* xk = inv(Dk)*(Dk*xk) */
754:     diag  = aa+k*25;          /* ptr to inv(Dk) */
755:     tp    = t + k*5;
756:     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
757:     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
758:     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
759:     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
760:     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
761:   }

763:   /* solve U*x = y by back substitution */
764:   for (k=mbs-1; k>=0; k--) {
765:     v  = aa + 25*ai[k];
766:     vj = aj + ai[k];
767:     tp = t + k*5;
768:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
769:     nz = ai[k+1] - ai[k];

771:     tp = t + (*vj)*5;
772:     while (nz--) {
773:       /* xk += U(k,:)*x(:) */
774:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
775:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
776:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
777:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
778:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
779:       vj++;
780:       tp = t + (*vj)*5;
781:       v += 25;
782:     }
783:     tp       = t + k*5;
784:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
785:     idx      = 5*r[k];
786:     x[idx]   = x0;
787:     x[idx+1] = x1;
788:     x[idx+2] = x2;
789:     x[idx+3] = x3;
790:     x[idx+4] = x4;
791:   }

793:   ISRestoreIndices(isrow,&r);
794:   VecRestoreArrayRead(bb,&b);
795:   VecRestoreArray(xx,&x);
796:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
797:   return(0);
798: }

800: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
801: {
802:   const MatScalar *v,*diag;
803:   PetscScalar     *xp,x0,x1,x2,x3,x4;
804:   PetscInt        nz,k;
805:   const PetscInt  *vj;

808:   for (k=0; k<mbs; k++) {
809:     v  = aa + 25*ai[k];
810:     xp = x + k*5;
811:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
812:     nz = ai[k+1] - ai[k];
813:     vj = aj + ai[k];
814:     xp = x + (*vj)*5;
815:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
816:     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
817:     while (nz--) {
818:       /* x(:) += U(k,:)^T*(Dk*xk) */
819:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
820:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
821:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
822:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
823:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
824:       vj++;
825:       xp = x + (*vj)*5;
826:       v += 25;
827:     }
828:     /* xk = inv(Dk)*(Dk*xk) */
829:     diag  = aa+k*25;         /* ptr to inv(Dk) */
830:     xp    = x + k*5;
831:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
832:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
833:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
834:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
835:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
836:   }
837:   return(0);
838: }

840: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
841: {
842:   const MatScalar *v;
843:   PetscScalar     *xp,x0,x1,x2,x3,x4;
844:   PetscInt        nz,k;
845:   const PetscInt  *vj;

848:   for (k=mbs-1; k>=0; k--) {
849:     v  = aa + 25*ai[k];
850:     xp = x + k*5;
851:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
852:     nz = ai[k+1] - ai[k];
853:     vj = aj + ai[k];
854:     xp = x + (*vj)*5;
855:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
856:     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
857:     while (nz--) {
858:       /* xk += U(k,:)*x(:) */
859:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
860:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
861:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
862:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
863:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
864:       vj++;
865:       v += 25; xp = x + (*vj)*5;
866:     }
867:     xp   = x + k*5;
868:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
869:   }
870:   return(0);
871: }

873: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
874: {
875:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
876:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
877:   const MatScalar   *aa=a->a;
878:   PetscScalar       *x;
879:   const PetscScalar *b;
880:   PetscErrorCode    ierr;

883:   VecGetArrayRead(bb,&b);
884:   VecGetArray(xx,&x);

886:   /* solve U^T * D * y = b by forward substitution */
887:   PetscArraycpy(x,b,5*mbs); /* x <- b */
888:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

890:   /* solve U*x = y by back substitution */
891:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

893:   VecRestoreArrayRead(bb,&b);
894:   VecRestoreArray(xx,&x);
895:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
896:   return(0);
897: }

899: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
900: {
901:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
902:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
903:   const MatScalar   *aa=a->a;
904:   PetscScalar       *x;
905:   const PetscScalar *b;
906:   PetscErrorCode    ierr;

909:   VecGetArrayRead(bb,&b);
910:   VecGetArray(xx,&x);
911:   PetscArraycpy(x,b,5*mbs); /* x <- b */
912:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
913:   VecRestoreArrayRead(bb,&b);
914:   VecRestoreArray(xx,&x);
915:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
916:   return(0);
917: }

919: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
920: {
921:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
922:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
923:   const MatScalar   *aa=a->a;
924:   PetscScalar       *x;
925:   const PetscScalar *b;
926:   PetscErrorCode    ierr;

929:   VecGetArrayRead(bb,&b);
930:   VecGetArray(xx,&x);
931:   PetscArraycpy(x,b,5*mbs);
932:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
933:   VecRestoreArrayRead(bb,&b);
934:   VecRestoreArray(xx,&x);
935:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
936:   return(0);
937: }

939: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
940: {
941:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
942:   IS                isrow=a->row;
943:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
944:   PetscErrorCode    ierr;
945:   const PetscInt    *r,*vj;
946:   PetscInt          nz,k,idx;
947:   const MatScalar   *aa=a->a,*v,*diag;
948:   PetscScalar       *x,x0,x1,x2,x3,*t,*tp;
949:   const PetscScalar *b;

952:   VecGetArrayRead(bb,&b);
953:   VecGetArray(xx,&x);
954:   t    = a->solve_work;
955:   ISGetIndices(isrow,&r);

957:   /* solve U^T * D * y = b by forward substitution */
958:   tp = t;
959:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
960:     idx   = 4*r[k];
961:     tp[0] = b[idx];
962:     tp[1] = b[idx+1];
963:     tp[2] = b[idx+2];
964:     tp[3] = b[idx+3];
965:     tp   += 4;
966:   }

968:   for (k=0; k<mbs; k++) {
969:     v  = aa + 16*ai[k];
970:     vj = aj + ai[k];
971:     tp = t + k*4;
972:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
973:     nz = ai[k+1] - ai[k];

975:     tp = t + (*vj)*4;
976:     while (nz--) {
977:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
978:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
979:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
980:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
981:       vj++;
982:       tp = t + (*vj)*4;
983:       v += 16;
984:     }

986:     /* xk = inv(Dk)*(Dk*xk) */
987:     diag  = aa+k*16;          /* ptr to inv(Dk) */
988:     tp    = t + k*4;
989:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
990:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
991:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
992:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
993:   }

995:   /* solve U*x = y by back substitution */
996:   for (k=mbs-1; k>=0; k--) {
997:     v  = aa + 16*ai[k];
998:     vj = aj + ai[k];
999:     tp = t + k*4;
1000:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1001:     nz = ai[k+1] - ai[k];

1003:     tp = t + (*vj)*4;
1004:     while (nz--) {
1005:       /* xk += U(k,:)*x(:) */
1006:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1007:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1008:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1009:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1010:       vj++;
1011:       tp = t + (*vj)*4;
1012:       v += 16;
1013:     }
1014:     tp       = t + k*4;
1015:     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1016:     idx      = 4*r[k];
1017:     x[idx]   = x0;
1018:     x[idx+1] = x1;
1019:     x[idx+2] = x2;
1020:     x[idx+3] = x3;
1021:   }

1023:   ISRestoreIndices(isrow,&r);
1024:   VecRestoreArrayRead(bb,&b);
1025:   VecRestoreArray(xx,&x);
1026:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1027:   return(0);
1028: }

1030: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1031: {
1032:   const MatScalar *v,*diag;
1033:   PetscScalar     *xp,x0,x1,x2,x3;
1034:   PetscInt        nz,k;
1035:   const PetscInt  *vj;

1038:   for (k=0; k<mbs; k++) {
1039:     v  = aa + 16*ai[k];
1040:     xp = x + k*4;
1041:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1042:     nz = ai[k+1] - ai[k];
1043:     vj = aj + ai[k];
1044:     xp = x + (*vj)*4;
1045:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1046:     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1047:     while (nz--) {
1048:       /* x(:) += U(k,:)^T*(Dk*xk) */
1049:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1050:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1051:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1052:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1053:       vj++;
1054:       xp = x + (*vj)*4;
1055:       v += 16;
1056:     }
1057:     /* xk = inv(Dk)*(Dk*xk) */
1058:     diag  = aa+k*16;         /* ptr to inv(Dk) */
1059:     xp    = x + k*4;
1060:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1061:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1062:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1063:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1064:   }
1065:   return(0);
1066: }

1068: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1069: {
1070:   const MatScalar *v;
1071:   PetscScalar     *xp,x0,x1,x2,x3;
1072:   PetscInt        nz,k;
1073:   const PetscInt  *vj;

1076:   for (k=mbs-1; k>=0; k--) {
1077:     v  = aa + 16*ai[k];
1078:     xp = x + k*4;
1079:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1080:     nz = ai[k+1] - ai[k];
1081:     vj = aj + ai[k];
1082:     xp = x + (*vj)*4;
1083:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1084:     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1085:     while (nz--) {
1086:       /* xk += U(k,:)*x(:) */
1087:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1088:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1089:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1090:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1091:       vj++;
1092:       v += 16; xp = x + (*vj)*4;
1093:     }
1094:     xp    = x + k*4;
1095:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1096:   }
1097:   return(0);
1098: }

1100: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1101: {
1102:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1103:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1104:   const MatScalar   *aa=a->a;
1105:   PetscScalar       *x;
1106:   const PetscScalar *b;
1107:   PetscErrorCode    ierr;

1110:   VecGetArrayRead(bb,&b);
1111:   VecGetArray(xx,&x);

1113:   /* solve U^T * D * y = b by forward substitution */
1114:   PetscArraycpy(x,b,4*mbs); /* x <- b */
1115:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);

1117:   /* solve U*x = y by back substitution */
1118:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1119:   VecRestoreArrayRead(bb,&b);
1120:   VecRestoreArray(xx,&x);
1121:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1122:   return(0);
1123: }

1125: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1126: {
1127:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1128:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1129:   const MatScalar   *aa=a->a;
1130:   PetscScalar       *x;
1131:   const PetscScalar *b;
1132:   PetscErrorCode    ierr;

1135:   VecGetArrayRead(bb,&b);
1136:   VecGetArray(xx,&x);
1137:   PetscArraycpy(x,b,4*mbs); /* x <- b */
1138:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1139:   VecRestoreArrayRead(bb,&b);
1140:   VecRestoreArray(xx,&x);
1141:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1142:   return(0);
1143: }

1145: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1146: {
1147:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1148:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1149:   const MatScalar   *aa=a->a;
1150:   PetscScalar       *x;
1151:   const PetscScalar *b;
1152:   PetscErrorCode    ierr;

1155:   VecGetArrayRead(bb,&b);
1156:   VecGetArray(xx,&x);
1157:   PetscArraycpy(x,b,4*mbs);
1158:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1159:   VecRestoreArrayRead(bb,&b);
1160:   VecRestoreArray(xx,&x);
1161:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1162:   return(0);
1163: }

1165: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1166: {
1167:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1168:   IS                isrow=a->row;
1169:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1170:   PetscErrorCode    ierr;
1171:   const PetscInt    *r;
1172:   PetscInt          nz,k,idx;
1173:   const PetscInt    *vj;
1174:   const MatScalar   *aa=a->a,*v,*diag;
1175:   PetscScalar       *x,x0,x1,x2,*t,*tp;
1176:   const PetscScalar *b;

1179:   VecGetArrayRead(bb,&b);
1180:   VecGetArray(xx,&x);
1181:   t    = a->solve_work;
1182:   ISGetIndices(isrow,&r);

1184:   /* solve U^T * D * y = b by forward substitution */
1185:   tp = t;
1186:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1187:     idx   = 3*r[k];
1188:     tp[0] = b[idx];
1189:     tp[1] = b[idx+1];
1190:     tp[2] = b[idx+2];
1191:     tp   += 3;
1192:   }

1194:   for (k=0; k<mbs; k++) {
1195:     v  = aa + 9*ai[k];
1196:     vj = aj + ai[k];
1197:     tp = t + k*3;
1198:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1199:     nz = ai[k+1] - ai[k];

1201:     tp = t + (*vj)*3;
1202:     while (nz--) {
1203:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1204:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1205:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1206:       vj++;
1207:       tp = t + (*vj)*3;
1208:       v += 9;
1209:     }

1211:     /* xk = inv(Dk)*(Dk*xk) */
1212:     diag  = aa+k*9;          /* ptr to inv(Dk) */
1213:     tp    = t + k*3;
1214:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1215:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1216:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1217:   }

1219:   /* solve U*x = y by back substitution */
1220:   for (k=mbs-1; k>=0; k--) {
1221:     v  = aa + 9*ai[k];
1222:     vj = aj + ai[k];
1223:     tp = t + k*3;
1224:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1225:     nz = ai[k+1] - ai[k];

1227:     tp = t + (*vj)*3;
1228:     while (nz--) {
1229:       /* xk += U(k,:)*x(:) */
1230:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1231:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1232:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1233:       vj++;
1234:       tp = t + (*vj)*3;
1235:       v += 9;
1236:     }
1237:     tp       = t + k*3;
1238:     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1239:     idx      = 3*r[k];
1240:     x[idx]   = x0;
1241:     x[idx+1] = x1;
1242:     x[idx+2] = x2;
1243:   }

1245:   ISRestoreIndices(isrow,&r);
1246:   VecRestoreArrayRead(bb,&b);
1247:   VecRestoreArray(xx,&x);
1248:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1249:   return(0);
1250: }

1252: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1253: {
1254:   const MatScalar *v,*diag;
1255:   PetscScalar     *xp,x0,x1,x2;
1256:   PetscInt        nz,k;
1257:   const PetscInt  *vj;

1260:   for (k=0; k<mbs; k++) {
1261:     v  = aa + 9*ai[k];
1262:     xp = x + k*3;
1263:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1264:     nz = ai[k+1] - ai[k];
1265:     vj = aj + ai[k];
1266:     xp = x + (*vj)*3;
1267:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1268:     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1269:     while (nz--) {
1270:       /* x(:) += U(k,:)^T*(Dk*xk) */
1271:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1272:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1273:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1274:       vj++;
1275:       xp = x + (*vj)*3;
1276:       v += 9;
1277:     }
1278:     /* xk = inv(Dk)*(Dk*xk) */
1279:     diag  = aa+k*9;         /* ptr to inv(Dk) */
1280:     xp    = x + k*3;
1281:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1282:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1283:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1284:   }
1285:   return(0);
1286: }

1288: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1289: {
1290:   const MatScalar *v;
1291:   PetscScalar     *xp,x0,x1,x2;
1292:   PetscInt        nz,k;
1293:   const PetscInt  *vj;

1296:   for (k=mbs-1; k>=0; k--) {
1297:     v  = aa + 9*ai[k];
1298:     xp = x + k*3;
1299:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1300:     nz = ai[k+1] - ai[k];
1301:     vj = aj + ai[k];
1302:     xp = x + (*vj)*3;
1303:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1304:     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1305:     while (nz--) {
1306:       /* xk += U(k,:)*x(:) */
1307:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1308:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1309:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1310:       vj++;
1311:       v += 9; xp = x + (*vj)*3;
1312:     }
1313:     xp    = x + k*3;
1314:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1315:   }
1316:   return(0);
1317: }

1319: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1320: {
1321:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1322:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1323:   const MatScalar   *aa=a->a;
1324:   PetscScalar       *x;
1325:   const PetscScalar *b;
1326:   PetscErrorCode    ierr;

1329:   VecGetArrayRead(bb,&b);
1330:   VecGetArray(xx,&x);

1332:   /* solve U^T * D * y = b by forward substitution */
1333:   PetscArraycpy(x,b,3*mbs);
1334:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1336:   /* solve U*x = y by back substitution */
1337:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1339:   VecRestoreArrayRead(bb,&b);
1340:   VecRestoreArray(xx,&x);
1341:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1342:   return(0);
1343: }

1345: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1346: {
1347:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1348:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1349:   const MatScalar   *aa=a->a;
1350:   PetscScalar       *x;
1351:   const PetscScalar *b;
1352:   PetscErrorCode    ierr;

1355:   VecGetArrayRead(bb,&b);
1356:   VecGetArray(xx,&x);
1357:   PetscArraycpy(x,b,3*mbs);
1358:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1359:   VecRestoreArrayRead(bb,&b);
1360:   VecRestoreArray(xx,&x);
1361:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1362:   return(0);
1363: }

1365: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1366: {
1367:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1368:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1369:   const MatScalar   *aa=a->a;
1370:   PetscScalar       *x;
1371:   const PetscScalar *b;
1372:   PetscErrorCode    ierr;

1375:   VecGetArrayRead(bb,&b);
1376:   VecGetArray(xx,&x);
1377:   PetscArraycpy(x,b,3*mbs);
1378:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1379:   VecRestoreArrayRead(bb,&b);
1380:   VecRestoreArray(xx,&x);
1381:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1382:   return(0);
1383: }

1385: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1386: {
1387:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
1388:   IS                isrow=a->row;
1389:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1390:   PetscErrorCode    ierr;
1391:   const PetscInt    *r,*vj;
1392:   PetscInt          nz,k,k2,idx;
1393:   const MatScalar   *aa=a->a,*v,*diag;
1394:   PetscScalar       *x,x0,x1,*t;
1395:   const PetscScalar *b;

1398:   VecGetArrayRead(bb,&b);
1399:   VecGetArray(xx,&x);
1400:   t    = a->solve_work;
1401:   ISGetIndices(isrow,&r);

1403:   /* solve U^T * D * y = perm(b) by forward substitution */
1404:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1405:     idx      = 2*r[k];
1406:     t[k*2]   = b[idx];
1407:     t[k*2+1] = b[idx+1];
1408:   }
1409:   for (k=0; k<mbs; k++) {
1410:     v  = aa + 4*ai[k];
1411:     vj = aj + ai[k];
1412:     k2 = k*2;
1413:     x0 = t[k2]; x1 = t[k2+1];
1414:     nz = ai[k+1] - ai[k];
1415:     while (nz--) {
1416:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1417:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1418:       vj++; v      += 4;
1419:     }
1420:     diag    = aa+k*4; /* ptr to inv(Dk) */
1421:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1422:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1423:   }

1425:   /* solve U*x = y by back substitution */
1426:   for (k=mbs-1; k>=0; k--) {
1427:     v  = aa + 4*ai[k];
1428:     vj = aj + ai[k];
1429:     k2 = k*2;
1430:     x0 = t[k2]; x1 = t[k2+1];
1431:     nz = ai[k+1] - ai[k];
1432:     while (nz--) {
1433:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1434:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1435:       vj++;
1436:       v += 4;
1437:     }
1438:     t[k2]    = x0;
1439:     t[k2+1]  = x1;
1440:     idx      = 2*r[k];
1441:     x[idx]   = x0;
1442:     x[idx+1] = x1;
1443:   }

1445:   ISRestoreIndices(isrow,&r);
1446:   VecRestoreArrayRead(bb,&b);
1447:   VecRestoreArray(xx,&x);
1448:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1449:   return(0);
1450: }

1452: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1453: {
1454:   const MatScalar *v,*diag;
1455:   PetscScalar     x0,x1;
1456:   PetscInt        nz,k,k2;
1457:   const PetscInt  *vj;

1460:   for (k=0; k<mbs; k++) {
1461:     v  = aa + 4*ai[k];
1462:     vj = aj + ai[k];
1463:     k2 = k*2;
1464:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1465:     nz = ai[k+1] - ai[k];
1466:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1467:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1468:     while (nz--) {
1469:       /* x(:) += U(k,:)^T*(Dk*xk) */
1470:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1471:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1472:       vj++; v      += 4;
1473:     }
1474:     /* xk = inv(Dk)*(Dk*xk) */
1475:     diag    = aa+k*4;       /* ptr to inv(Dk) */
1476:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1477:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1478:   }
1479:   return(0);
1480: }

1482: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1483: {
1484:   const MatScalar *v;
1485:   PetscScalar     x0,x1;
1486:   PetscInt        nz,k,k2;
1487:   const PetscInt  *vj;

1490:   for (k=mbs-1; k>=0; k--) {
1491:     v  = aa + 4*ai[k];
1492:     vj = aj + ai[k];
1493:     k2 = k*2;
1494:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1495:     nz = ai[k+1] - ai[k];
1496:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1497:     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1498:     while (nz--) {
1499:       /* xk += U(k,:)*x(:) */
1500:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1501:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1502:       vj++;
1503:       v += 4;
1504:     }
1505:     x[k2]   = x0;
1506:     x[k2+1] = x1;
1507:   }
1508:   return(0);
1509: }

1511: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1512: {
1513:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1514:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1515:   const MatScalar   *aa=a->a;
1516:   PetscScalar       *x;
1517:   const PetscScalar *b;
1518:   PetscErrorCode    ierr;

1521:   VecGetArrayRead(bb,&b);
1522:   VecGetArray(xx,&x);

1524:   /* solve U^T * D * y = b by forward substitution */
1525:   PetscArraycpy(x,b,2*mbs);
1526:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1528:   /* solve U*x = y by back substitution */
1529:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1531:   VecRestoreArrayRead(bb,&b);
1532:   VecRestoreArray(xx,&x);
1533:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1534:   return(0);
1535: }

1537: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1538: {
1539:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1540:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1541:   const MatScalar   *aa=a->a;
1542:   PetscScalar       *x;
1543:   const PetscScalar *b;
1544:   PetscErrorCode    ierr;

1547:   VecGetArrayRead(bb,&b);
1548:   VecGetArray(xx,&x);
1549:   PetscArraycpy(x,b,2*mbs);
1550:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1551:   VecRestoreArrayRead(bb,&b);
1552:   VecRestoreArray(xx,&x);
1553:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1554:   return(0);
1555: }

1557: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1558: {
1559:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1560:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1561:   const MatScalar   *aa=a->a;
1562:   PetscScalar       *x;
1563:   const PetscScalar *b;
1564:   PetscErrorCode    ierr;

1567:   VecGetArrayRead(bb,&b);
1568:   VecGetArray(xx,&x);
1569:   PetscArraycpy(x,b,2*mbs);
1570:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1571:   VecRestoreArrayRead(bb,&b);
1572:   VecRestoreArray(xx,&x);
1573:   PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
1574:   return(0);
1575: }

1577: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1578: {
1579:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1580:   IS                isrow=a->row;
1581:   PetscErrorCode    ierr;
1582:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1583:   const MatScalar   *aa=a->a,*v;
1584:   const PetscScalar *b;
1585:   PetscScalar       *x,xk,*t;
1586:   PetscInt          nz,k,j;

1589:   VecGetArrayRead(bb,&b);
1590:   VecGetArray(xx,&x);
1591:   t    = a->solve_work;
1592:   ISGetIndices(isrow,&rp);

1594:   /* solve U^T*D*y = perm(b) by forward substitution */
1595:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1596:   for (k=0; k<mbs; k++) {
1597:     v  = aa + ai[k];
1598:     vj = aj + ai[k];
1599:     xk = t[k];
1600:     nz = ai[k+1] - ai[k] - 1;
1601:     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1602:     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1603:   }

1605:   /* solve U*perm(x) = y by back substitution */
1606:   for (k=mbs-1; k>=0; k--) {
1607:     v  = aa + adiag[k] - 1;
1608:     vj = aj + adiag[k] - 1;
1609:     nz = ai[k+1] - ai[k] - 1;
1610:     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1611:     x[rp[k]] = t[k];
1612:   }

1614:   ISRestoreIndices(isrow,&rp);
1615:   VecRestoreArrayRead(bb,&b);
1616:   VecRestoreArray(xx,&x);
1617:   PetscLogFlops(4.0*a->nz - 3.0*mbs);
1618:   return(0);
1619: }

1621: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1622: {
1623:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1624:   IS                isrow=a->row;
1625:   PetscErrorCode    ierr;
1626:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1627:   const MatScalar   *aa=a->a,*v;
1628:   PetscScalar       *x,xk,*t;
1629:   const PetscScalar *b;
1630:   PetscInt          nz,k;

1633:   VecGetArrayRead(bb,&b);
1634:   VecGetArray(xx,&x);
1635:   t    = a->solve_work;
1636:   ISGetIndices(isrow,&rp);

1638:   /* solve U^T*D*y = perm(b) by forward substitution */
1639:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1640:   for (k=0; k<mbs; k++) {
1641:     v  = aa + ai[k] + 1;
1642:     vj = aj + ai[k] + 1;
1643:     xk = t[k];
1644:     nz = ai[k+1] - ai[k] - 1;
1645:     while (nz--) t[*vj++] += (*v++) * xk;
1646:     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1647:   }

1649:   /* solve U*perm(x) = y by back substitution */
1650:   for (k=mbs-1; k>=0; k--) {
1651:     v  = aa + ai[k] + 1;
1652:     vj = aj + ai[k] + 1;
1653:     nz = ai[k+1] - ai[k] - 1;
1654:     while (nz--) t[k] += (*v++) * t[*vj++];
1655:     x[rp[k]] = t[k];
1656:   }

1658:   ISRestoreIndices(isrow,&rp);
1659:   VecRestoreArrayRead(bb,&b);
1660:   VecRestoreArray(xx,&x);
1661:   PetscLogFlops(4.0*a->nz - 3*mbs);
1662:   return(0);
1663: }

1665: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1666: {
1667:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1668:   IS                isrow=a->row;
1669:   PetscErrorCode    ierr;
1670:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1671:   const MatScalar   *aa=a->a,*v;
1672:   PetscReal         diagk;
1673:   PetscScalar       *x,xk;
1674:   const PetscScalar *b;
1675:   PetscInt          nz,k;

1678:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1679:   VecGetArrayRead(bb,&b);
1680:   VecGetArray(xx,&x);
1681:   ISGetIndices(isrow,&rp);

1683:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1684:   for (k=0; k<mbs; k++) {
1685:     v  = aa + ai[k];
1686:     vj = aj + ai[k];
1687:     xk = x[k];
1688:     nz = ai[k+1] - ai[k] - 1;
1689:     while (nz--) x[*vj++] += (*v++) * xk;

1691:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1692:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1693:     x[k] = xk*PetscSqrtReal(diagk);
1694:   }
1695:   ISRestoreIndices(isrow,&rp);
1696:   VecRestoreArrayRead(bb,&b);
1697:   VecRestoreArray(xx,&x);
1698:   PetscLogFlops(2.0*a->nz - mbs);
1699:   return(0);
1700: }

1702: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1703: {
1704:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1705:   IS                isrow=a->row;
1706:   PetscErrorCode    ierr;
1707:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1708:   const MatScalar   *aa=a->a,*v;
1709:   PetscReal         diagk;
1710:   PetscScalar       *x,xk;
1711:   const PetscScalar *b;
1712:   PetscInt          nz,k;

1715:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1716:   VecGetArrayRead(bb,&b);
1717:   VecGetArray(xx,&x);
1718:   ISGetIndices(isrow,&rp);

1720:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1721:   for (k=0; k<mbs; k++) {
1722:     v  = aa + ai[k] + 1;
1723:     vj = aj + ai[k] + 1;
1724:     xk = x[k];
1725:     nz = ai[k+1] - ai[k] - 1;
1726:     while (nz--) x[*vj++] += (*v++) * xk;

1728:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1729:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1730:     x[k] = xk*PetscSqrtReal(diagk);
1731:   }
1732:   ISRestoreIndices(isrow,&rp);
1733:   VecRestoreArrayRead(bb,&b);
1734:   VecRestoreArray(xx,&x);
1735:   PetscLogFlops(2.0*a->nz - mbs);
1736:   return(0);
1737: }

1739: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1740: {
1741:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1742:   IS                isrow=a->row;
1743:   PetscErrorCode    ierr;
1744:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1745:   const MatScalar   *aa=a->a,*v;
1746:   PetscReal         diagk;
1747:   PetscScalar       *x,*t;
1748:   const PetscScalar *b;
1749:   PetscInt          nz,k;

1752:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1753:   VecGetArrayRead(bb,&b);
1754:   VecGetArray(xx,&x);
1755:   t    = a->solve_work;
1756:   ISGetIndices(isrow,&rp);

1758:   for (k=mbs-1; k>=0; k--) {
1759:     v     = aa + ai[k];
1760:     vj    = aj + ai[k];
1761:     diagk = PetscRealPart(aa[adiag[k]]);
1762:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1763:     t[k] = b[k] * PetscSqrtReal(diagk);
1764:     nz   = ai[k+1] - ai[k] - 1;
1765:     while (nz--) t[k] += (*v++) * t[*vj++];
1766:     x[rp[k]] = t[k];
1767:   }
1768:   ISRestoreIndices(isrow,&rp);
1769:   VecRestoreArrayRead(bb,&b);
1770:   VecRestoreArray(xx,&x);
1771:   PetscLogFlops(2.0*a->nz - mbs);
1772:   return(0);
1773: }

1775: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1776: {
1777:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1778:   IS                isrow=a->row;
1779:   PetscErrorCode    ierr;
1780:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1781:   const MatScalar   *aa=a->a,*v;
1782:   PetscReal         diagk;
1783:   PetscScalar       *x,*t;
1784:   const PetscScalar *b;
1785:   PetscInt          nz,k;

1788:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1789:   VecGetArrayRead(bb,&b);
1790:   VecGetArray(xx,&x);
1791:   t    = a->solve_work;
1792:   ISGetIndices(isrow,&rp);

1794:   for (k=mbs-1; k>=0; k--) {
1795:     v     = aa + ai[k] + 1;
1796:     vj    = aj + ai[k] + 1;
1797:     diagk = PetscRealPart(aa[ai[k]]);
1798:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1799:     t[k] = b[k] * PetscSqrtReal(diagk);
1800:     nz   = ai[k+1] - ai[k] - 1;
1801:     while (nz--) t[k] += (*v++) * t[*vj++];
1802:     x[rp[k]] = t[k];
1803:   }
1804:   ISRestoreIndices(isrow,&rp);
1805:   VecRestoreArrayRead(bb,&b);
1806:   VecRestoreArray(xx,&x);
1807:   PetscLogFlops(2.0*a->nz - mbs);
1808:   return(0);
1809: }

1811: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1812: {
1813:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1817:   if (A->rmap->bs == 1) {
1818:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1819:   } else {
1820:     IS                isrow=a->row;
1821:     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1822:     const MatScalar   *aa=a->a,*v;
1823:     PetscScalar       *x,*t;
1824:     const PetscScalar *b;
1825:     PetscInt          nz,k,n,i,j;

1827:     if (bb->n > a->solves_work_n) {
1828:       PetscFree(a->solves_work);
1829:       PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1830:       a->solves_work_n = bb->n;
1831:     }
1832:     n    = bb->n;
1833:     VecGetArrayRead(bb->v,&b);
1834:     VecGetArray(xx->v,&x);
1835:     t    = a->solves_work;

1837:     ISGetIndices(isrow,&rp);

1839:     /* solve U^T*D*y = perm(b) by forward substitution */
1840:     for (k=0; k<mbs; k++) {
1841:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1842:     }
1843:     for (k=0; k<mbs; k++) {
1844:       v  = aa + ai[k];
1845:       vj = aj + ai[k];
1846:       nz = ai[k+1] - ai[k] - 1;
1847:       for (j=0; j<nz; j++) {
1848:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1849:         v++;vj++;
1850:       }
1851:       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1852:     }

1854:     /* solve U*perm(x) = y by back substitution */
1855:     for (k=mbs-1; k>=0; k--) {
1856:       v  = aa + ai[k] - 1;
1857:       vj = aj + ai[k] - 1;
1858:       nz = ai[k+1] - ai[k] - 1;
1859:       for (j=0; j<nz; j++) {
1860:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1861:         v++;vj++;
1862:       }
1863:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1864:     }

1866:     ISRestoreIndices(isrow,&rp);
1867:     VecRestoreArrayRead(bb->v,&b);
1868:     VecRestoreArray(xx->v,&x);
1869:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1870:   }
1871:   return(0);
1872: }

1874: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1875: {
1876:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1880:   if (A->rmap->bs == 1) {
1881:     MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1882:   } else {
1883:     IS                isrow=a->row;
1884:     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1885:     const MatScalar   *aa=a->a,*v;
1886:     PetscScalar       *x,*t;
1887:     const PetscScalar *b;
1888:     PetscInt          nz,k,n,i;

1890:     if (bb->n > a->solves_work_n) {
1891:       PetscFree(a->solves_work);
1892:       PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1893:       a->solves_work_n = bb->n;
1894:     }
1895:     n    = bb->n;
1896:     VecGetArrayRead(bb->v,&b);
1897:     VecGetArray(xx->v,&x);
1898:     t    = a->solves_work;

1900:     ISGetIndices(isrow,&rp);

1902:     /* solve U^T*D*y = perm(b) by forward substitution */
1903:     for (k=0; k<mbs; k++) {
1904:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
1905:     }
1906:     for (k=0; k<mbs; k++) {
1907:       v  = aa + ai[k];
1908:       vj = aj + ai[k];
1909:       nz = ai[k+1] - ai[k];
1910:       while (nz--) {
1911:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1912:         v++;vj++;
1913:       }
1914:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1915:     }

1917:     /* solve U*perm(x) = y by back substitution */
1918:     for (k=mbs-1; k>=0; k--) {
1919:       v  = aa + ai[k];
1920:       vj = aj + ai[k];
1921:       nz = ai[k+1] - ai[k];
1922:       while (nz--) {
1923:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1924:         v++;vj++;
1925:       }
1926:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1927:     }

1929:     ISRestoreIndices(isrow,&rp);
1930:     VecRestoreArrayRead(bb->v,&b);
1931:     VecRestoreArray(xx->v,&x);
1932:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1933:   }
1934:   return(0);
1935: }

1937: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1938: {
1939:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1940:   PetscErrorCode    ierr;
1941:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1942:   const MatScalar   *aa=a->a,*v;
1943:   const PetscScalar *b;
1944:   PetscScalar       *x,xi;
1945:   PetscInt          nz,i,j;

1948:   VecGetArrayRead(bb,&b);
1949:   VecGetArray(xx,&x);
1950:   /* solve U^T*D*y = b by forward substitution */
1951:   PetscArraycpy(x,b,mbs);
1952:   for (i=0; i<mbs; i++) {
1953:     v  = aa + ai[i];
1954:     vj = aj + ai[i];
1955:     xi = x[i];
1956:     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1957:     for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1958:     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1959:   }
1960:   /* solve U*x = y by backward substitution */
1961:   for (i=mbs-2; i>=0; i--) {
1962:     xi = x[i];
1963:     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1964:     vj = aj + adiag[i] - 1;
1965:     nz = ai[i+1] - ai[i] - 1;
1966:     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1967:     x[i] = xi;
1968:   }
1969:   VecRestoreArrayRead(bb,&b);
1970:   VecRestoreArray(xx,&x);
1971:   PetscLogFlops(4.0*a->nz - 3*mbs);
1972:   return(0);
1973: }

1975: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)
1976: {
1977:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1978:   PetscErrorCode    ierr;
1979:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1980:   const MatScalar   *aa=a->a,*v;
1981:   const PetscScalar *b;
1982:   PetscScalar       *x,xi;
1983:   PetscInt          nz,i,j,neq,ldb,ldx;
1984:   PetscBool         isdense;

1987:   if (!mbs) return(0);
1988:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);
1989:   if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix");
1990:   if (X != B) {
1991:     PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);
1992:     if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix");
1993:   }
1994:   MatDenseGetArrayRead(B,&b);
1995:   MatDenseGetLDA(B,&ldb);
1996:   MatDenseGetArray(X,&x);
1997:   MatDenseGetLDA(X,&ldx);
1998:   for (neq=0; neq<B->cmap->n; neq++) {
1999:     /* solve U^T*D*y = b by forward substitution */
2000:     PetscArraycpy(x,b,mbs);
2001:     for (i=0; i<mbs; i++) {
2002:       v  = aa + ai[i];
2003:       vj = aj + ai[i];
2004:       xi = x[i];
2005:       nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2006:       for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
2007:       x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2008:     }
2009:     /* solve U*x = y by backward substitution */
2010:     for (i=mbs-2; i>=0; i--) {
2011:       xi = x[i];
2012:       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2013:       vj = aj + adiag[i] - 1;
2014:       nz = ai[i+1] - ai[i] - 1;
2015:       for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2016:       x[i] = xi;
2017:     }
2018:     b += ldb;
2019:     x += ldx;
2020:   }
2021:   MatDenseRestoreArrayRead(B,&b);
2022:   MatDenseRestoreArray(X,&x);
2023:   PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));
2024:   return(0);
2025: }

2027: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2028: {
2029:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2030:   PetscErrorCode    ierr;
2031:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2032:   const MatScalar   *aa=a->a,*v;
2033:   PetscScalar       *x,xk;
2034:   const PetscScalar *b;
2035:   PetscInt          nz,k;

2038:   VecGetArrayRead(bb,&b);
2039:   VecGetArray(xx,&x);

2041:   /* solve U^T*D*y = b by forward substitution */
2042:   PetscArraycpy(x,b,mbs);
2043:   for (k=0; k<mbs; k++) {
2044:     v  = aa + ai[k] + 1;
2045:     vj = aj + ai[k] + 1;
2046:     xk = x[k];
2047:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2048:     while (nz--) x[*vj++] += (*v++) * xk;
2049:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2050:   }

2052:   /* solve U*x = y by back substitution */
2053:   for (k=mbs-2; k>=0; k--) {
2054:     v  = aa + ai[k] + 1;
2055:     vj = aj + ai[k] + 1;
2056:     xk = x[k];
2057:     nz = ai[k+1] - ai[k] - 1;
2058:     while (nz--) {
2059:       xk += (*v++) * x[*vj++];
2060:     }
2061:     x[k] = xk;
2062:   }

2064:   VecRestoreArrayRead(bb,&b);
2065:   VecRestoreArray(xx,&x);
2066:   PetscLogFlops(4.0*a->nz - 3*mbs);
2067:   return(0);
2068: }

2070: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2071: {
2072:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2073:   PetscErrorCode    ierr;
2074:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2075:   const MatScalar   *aa=a->a,*v;
2076:   PetscReal         diagk;
2077:   PetscScalar       *x;
2078:   const PetscScalar *b;
2079:   PetscInt          nz,k;

2082:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2083:   VecGetArrayRead(bb,&b);
2084:   VecGetArray(xx,&x);
2085:   PetscArraycpy(x,b,mbs);
2086:   for (k=0; k<mbs; k++) {
2087:     v  = aa + ai[k];
2088:     vj = aj + ai[k];
2089:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2090:     while (nz--) x[*vj++] += (*v++) * x[k];
2091:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2092:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2093:     x[k] *= PetscSqrtReal(diagk);
2094:   }
2095:   VecRestoreArrayRead(bb,&b);
2096:   VecRestoreArray(xx,&x);
2097:   PetscLogFlops(2.0*a->nz - mbs);
2098:   return(0);
2099: }

2101: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2102: {
2103:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2104:   PetscErrorCode    ierr;
2105:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2106:   const MatScalar   *aa=a->a,*v;
2107:   PetscReal         diagk;
2108:   PetscScalar       *x;
2109:   const PetscScalar *b;
2110:   PetscInt          nz,k;

2113:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2114:   VecGetArrayRead(bb,&b);
2115:   VecGetArray(xx,&x);
2116:   PetscArraycpy(x,b,mbs);
2117:   for (k=0; k<mbs; k++) {
2118:     v  = aa + ai[k] + 1;
2119:     vj = aj + ai[k] + 1;
2120:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2121:     while (nz--) x[*vj++] += (*v++) * x[k];
2122:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2123:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2124:     x[k] *= PetscSqrtReal(diagk);
2125:   }
2126:   VecRestoreArrayRead(bb,&b);
2127:   VecRestoreArray(xx,&x);
2128:   PetscLogFlops(2.0*a->nz - mbs);
2129:   return(0);
2130: }

2132: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2133: {
2134:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2135:   PetscErrorCode    ierr;
2136:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2137:   const MatScalar   *aa=a->a,*v;
2138:   PetscReal         diagk;
2139:   PetscScalar       *x;
2140:   const PetscScalar *b;
2141:   PetscInt          nz,k;

2144:   /* solve D^(1/2)*U*x = b by back substitution */
2145:   VecGetArrayRead(bb,&b);
2146:   VecGetArray(xx,&x);

2148:   for (k=mbs-1; k>=0; k--) {
2149:     v     = aa + ai[k];
2150:     vj    = aj + ai[k];
2151:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2152:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2153:     x[k] = PetscSqrtReal(diagk)*b[k];
2154:     nz   = ai[k+1] - ai[k] - 1;
2155:     while (nz--) x[k] += (*v++) * x[*vj++];
2156:   }
2157:   VecRestoreArrayRead(bb,&b);
2158:   VecRestoreArray(xx,&x);
2159:   PetscLogFlops(2.0*a->nz - mbs);
2160:   return(0);
2161: }

2163: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2164: {
2165:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2166:   PetscErrorCode    ierr;
2167:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2168:   const MatScalar   *aa=a->a,*v;
2169:   PetscReal         diagk;
2170:   PetscScalar       *x;
2171:   const PetscScalar *b;
2172:   PetscInt          nz,k;

2175:   /* solve D^(1/2)*U*x = b by back substitution */
2176:   VecGetArrayRead(bb,&b);
2177:   VecGetArray(xx,&x);

2179:   for (k=mbs-1; k>=0; k--) {
2180:     v     = aa + ai[k] + 1;
2181:     vj    = aj + ai[k] + 1;
2182:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2183:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2184:     x[k] = PetscSqrtReal(diagk)*b[k];
2185:     nz   = ai[k+1] - ai[k] - 1;
2186:     while (nz--) x[k] += (*v++) * x[*vj++];
2187:   }
2188:   VecRestoreArrayRead(bb,&b);
2189:   VecRestoreArray(xx,&x);
2190:   PetscLogFlops(2.0*a->nz - mbs);
2191:   return(0);
2192: }

2194: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2195: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2196: {
2197:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2199:   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2200:   PetscInt       *jutmp,bs = A->rmap->bs,i;
2201:   PetscInt       m,reallocs = 0,*levtmp;
2202:   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2203:   PetscInt       incrlev,*lev,shift,prow,nz;
2204:   PetscReal      f = info->fill,levels = info->levels;
2205:   PetscBool      perm_identity;

2208:   /* check whether perm is the identity mapping */
2209:   ISIdentity(perm,&perm_identity);

2211:   if (perm_identity) {
2212:     a->permute = PETSC_FALSE;
2213:     ai         = a->i; aj = a->j;
2214:   } else { /*  non-trivial permutation */
2215:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2216:   }

2218:   /* initialization */
2219:   ISGetIndices(perm,&rip);
2220:   umax  = (PetscInt)(f*ai[mbs] + 1);
2221:   PetscMalloc1(umax,&lev);
2222:   umax += mbs + 1;
2223:   shift = mbs + 1;
2224:   PetscMalloc1(mbs+1,&iu);
2225:   PetscMalloc1(umax,&ju);
2226:   iu[0] = mbs + 1;
2227:   juidx = mbs + 1;
2228:   /* prowl: linked list for pivot row */
2229:   PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);
2230:   /* q: linked list for col index */

2232:   for (i=0; i<mbs; i++) {
2233:     prowl[i] = mbs;
2234:     q[i]     = 0;
2235:   }

2237:   /* for each row k */
2238:   for (k=0; k<mbs; k++) {
2239:     nzk  = 0;
2240:     q[k] = mbs;
2241:     /* copy current row into linked list */
2242:     nz = ai[rip[k]+1] - ai[rip[k]];
2243:     j  = ai[rip[k]];
2244:     while (nz--) {
2245:       vj = rip[aj[j++]];
2246:       if (vj > k) {
2247:         qm = k;
2248:         do {
2249:           m = qm; qm = q[m];
2250:         } while (qm < vj);
2251:         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2252:         nzk++;
2253:         q[m]       = vj;
2254:         q[vj]      = qm;
2255:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2256:       }
2257:     }

2259:     /* modify nonzero structure of k-th row by computing fill-in
2260:        for each row prow to be merged in */
2261:     prow = k;
2262:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */

2264:     while (prow < k) {
2265:       /* merge row prow into k-th row */
2266:       jmin = iu[prow] + 1;
2267:       jmax = iu[prow+1];
2268:       qm   = k;
2269:       for (j=jmin; j<jmax; j++) {
2270:         incrlev = lev[j-shift] + 1;
2271:         if (incrlev > levels) continue;
2272:         vj = ju[j];
2273:         do {
2274:           m = qm; qm = q[m];
2275:         } while (qm < vj);
2276:         if (qm != vj) {      /* a fill */
2277:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2278:           levtmp[vj] = incrlev;
2279:         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2280:       }
2281:       prow = prowl[prow]; /* next pivot row */
2282:     }

2284:     /* add k to row list for first nonzero element in k-th row */
2285:     if (nzk > 1) {
2286:       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2287:       prowl[k] = prowl[i]; prowl[i] = k;
2288:     }
2289:     iu[k+1] = iu[k] + nzk;

2291:     /* allocate more space to ju and lev if needed */
2292:     if (iu[k+1] > umax) {
2293:       /* estimate how much additional space we will need */
2294:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2295:       /* just double the memory each time */
2296:       maxadd = umax;
2297:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2298:       umax += maxadd;

2300:       /* allocate a longer ju */
2301:       PetscMalloc1(umax,&jutmp);
2302:       PetscArraycpy(jutmp,ju,iu[k]);
2303:       PetscFree(ju);
2304:       ju   = jutmp;

2306:       PetscMalloc1(umax,&jutmp);
2307:       PetscArraycpy(jutmp,lev,iu[k]-shift);
2308:       PetscFree(lev);
2309:       lev       = jutmp;
2310:       reallocs += 2; /* count how many times we realloc */
2311:     }

2313:     /* save nonzero structure of k-th row in ju */
2314:     i=k;
2315:     while (nzk--) {
2316:       i                = q[i];
2317:       ju[juidx]        = i;
2318:       lev[juidx-shift] = levtmp[i];
2319:       juidx++;
2320:     }
2321:   }

2323: #if defined(PETSC_USE_INFO)
2324:   if (ai[mbs] != 0) {
2325:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2326:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
2327:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2328:     PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);
2329:     PetscInfo(A,"for best performance.\n");
2330:   } else {
2331:     PetscInfo(A,"Empty matrix\n");
2332:   }
2333: #endif

2335:   ISRestoreIndices(perm,&rip);
2336:   PetscFree3(prowl,q,levtmp);
2337:   PetscFree(lev);

2339:   /* put together the new matrix */
2340:   MatSeqSBAIJSetPreallocation(B,bs,0,NULL);

2342:   /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
2343:   b    = (Mat_SeqSBAIJ*)(B)->data;
2344:   PetscFree2(b->imax,b->ilen);

2346:   b->singlemalloc = PETSC_FALSE;
2347:   b->free_a       = PETSC_TRUE;
2348:   b->free_ij      = PETSC_TRUE;
2349:   /* the next line frees the default space generated by the Create() */
2350:   PetscFree3(b->a,b->j,b->i);
2351:   PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);
2352:   b->j    = ju;
2353:   b->i    = iu;
2354:   b->diag = NULL;
2355:   b->ilen = NULL;
2356:   b->imax = NULL;

2358:   ISDestroy(&b->row);
2359:   ISDestroy(&b->icol);
2360:   b->row  = perm;
2361:   b->icol = perm;
2362:   PetscObjectReference((PetscObject)perm);
2363:   PetscObjectReference((PetscObject)perm);
2364:   PetscMalloc1(bs*mbs+bs,&b->solve_work);
2365:   /* In b structure:  Free imax, ilen, old a, old j.
2366:      Allocate idnew, solve_work, new a, new j */
2367:   PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2368:   b->maxnz = b->nz = iu[mbs];

2370:   (B)->info.factor_mallocs   = reallocs;
2371:   (B)->info.fill_ratio_given = f;
2372:   if (ai[mbs] != 0) {
2373:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2374:   } else {
2375:     (B)->info.fill_ratio_needed = 0.0;
2376:   }
2377:   MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2378:   return(0);
2379: }

2381: /*
2382:   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2383: */
2384: #include <petscbt.h>
2385: #include <../src/mat/utils/freespace.h>
2386: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2387: {
2388:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2389:   PetscErrorCode     ierr;
2390:   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2391:   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2392:   const PetscInt     *rip;
2393:   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2394:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2395:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2396:   PetscReal          fill          =info->fill,levels=info->levels;
2397:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2398:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2399:   PetscBT            lnkbt;

2402:   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2403:   MatMissingDiagonal(A,&missing,&d);
2404:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2405:   if (bs > 1) {
2406:     MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2407:     return(0);
2408:   }

2410:   /* check whether perm is the identity mapping */
2411:   ISIdentity(perm,&perm_identity);
2412:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2413:   a->permute = PETSC_FALSE;

2415:   PetscMalloc1(am+1,&ui);
2416:   PetscMalloc1(am+1,&udiag);
2417:   ui[0] = 0;

2419:   /* ICC(0) without matrix ordering: simply rearrange column indices */
2420:   if (!levels) {
2421:     /* reuse the column pointers and row offsets for memory savings */
2422:     for (i=0; i<am; i++) {
2423:       ncols    = ai[i+1] - ai[i];
2424:       ui[i+1]  = ui[i] + ncols;
2425:       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2426:     }
2427:     PetscMalloc1(ui[am]+1,&uj);
2428:     cols = uj;
2429:     for (i=0; i<am; i++) {
2430:       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2431:       ncols = ai[i+1] - ai[i] -1;
2432:       for (j=0; j<ncols; j++) *cols++ = aj[j];
2433:       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2434:     }
2435:   } else { /* case: levels>0 */
2436:     ISGetIndices(perm,&rip);

2438:     /* initialization */
2439:     /* jl: linked list for storing indices of the pivot rows
2440:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2441:     PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2442:     for (i=0; i<am; i++) {
2443:       jl[i] = am; il[i] = 0;
2444:     }

2446:     /* create and initialize a linked list for storing column indices of the active row k */
2447:     nlnk = am + 1;
2448:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2450:     /* initial FreeSpace size is fill*(ai[am]+1) */
2451:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);

2453:     current_space = free_space;

2455:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);

2457:     current_space_lvl = free_space_lvl;

2459:     for (k=0; k<am; k++) {  /* for each active row k */
2460:       /* initialize lnk by the column indices of row k */
2461:       nzk   = 0;
2462:       ncols = ai[k+1] - ai[k];
2463:       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2464:       cols = aj+ai[k];
2465:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2466:       nzk += nlnk;

2468:       /* update lnk by computing fill-in for each pivot row to be merged in */
2469:       prow = jl[k]; /* 1st pivot row */

2471:       while (prow < k) {
2472:         nextprow = jl[prow];

2474:         /* merge prow into k-th row */
2475:         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2476:         jmax  = ui[prow+1];
2477:         ncols = jmax-jmin;
2478:         i     = jmin - ui[prow];

2480:         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2481:         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2482:         j    = *(uj - 1);
2483:         PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2484:         nzk += nlnk;

2486:         /* update il and jl for prow */
2487:         if (jmin < jmax) {
2488:           il[prow] = jmin;

2490:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2491:         }
2492:         prow = nextprow;
2493:       }

2495:       /* if free space is not available, make more free space */
2496:       if (current_space->local_remaining<nzk) {
2497:         i    = am - k + 1; /* num of unfactored rows */
2498:         i    = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2499:         PetscFreeSpaceGet(i,&current_space);
2500:         PetscFreeSpaceGet(i,&current_space_lvl);
2501:         reallocs++;
2502:       }

2504:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2505:       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2506:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2508:       /* add the k-th row into il and jl */
2509:       if (nzk > 1) {
2510:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2511:         jl[k] = jl[i]; jl[i] = k;
2512:         il[k] = ui[k] + 1;
2513:       }
2514:       uj_ptr[k]     = current_space->array;
2515:       uj_lvl_ptr[k] = current_space_lvl->array;

2517:       current_space->array               += nzk;
2518:       current_space->local_used          += nzk;
2519:       current_space->local_remaining     -= nzk;
2520:       current_space_lvl->array           += nzk;
2521:       current_space_lvl->local_used      += nzk;
2522:       current_space_lvl->local_remaining -= nzk;

2524:       ui[k+1] = ui[k] + nzk;
2525:     }

2527:     ISRestoreIndices(perm,&rip);
2528:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2530:     /* destroy list of free space and other temporary array(s) */
2531:     PetscMalloc1(ui[am]+1,&uj);
2532:     PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor  */
2533:     PetscIncompleteLLDestroy(lnk,lnkbt);
2534:     PetscFreeSpaceDestroy(free_space_lvl);

2536:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2538:   /* put together the new matrix in MATSEQSBAIJ format */
2539:   MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2541:   b    = (Mat_SeqSBAIJ*)(fact)->data;
2542:   PetscFree2(b->imax,b->ilen);

2544:   b->singlemalloc = PETSC_FALSE;
2545:   b->free_a       = PETSC_TRUE;
2546:   b->free_ij      = free_ij;

2548:   PetscMalloc1(ui[am]+1,&b->a);

2550:   b->j         = uj;
2551:   b->i         = ui;
2552:   b->diag      = udiag;
2553:   b->free_diag = PETSC_TRUE;
2554:   b->ilen      = NULL;
2555:   b->imax      = NULL;
2556:   b->row       = perm;
2557:   b->col       = perm;

2559:   PetscObjectReference((PetscObject)perm);
2560:   PetscObjectReference((PetscObject)perm);

2562:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2564:   PetscMalloc1(am+1,&b->solve_work);
2565:   PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));

2567:   b->maxnz = b->nz = ui[am];

2569:   fact->info.factor_mallocs   = reallocs;
2570:   fact->info.fill_ratio_given = fill;
2571:   if (ai[am] != 0) {
2572:     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2573:   } else {
2574:     fact->info.fill_ratio_needed = 0.0;
2575:   }
2576: #if defined(PETSC_USE_INFO)
2577:   if (ai[am] != 0) {
2578:     PetscReal af = fact->info.fill_ratio_needed;
2579:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2580:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2581:     PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2582:   } else {
2583:     PetscInfo(A,"Empty matrix\n");
2584:   }
2585: #endif
2586:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2587:   return(0);
2588: }

2590: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2591: {
2592:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2593:   Mat_SeqSBAIJ       *b;
2594:   PetscErrorCode     ierr;
2595:   PetscBool          perm_identity,free_ij = PETSC_TRUE;
2596:   PetscInt           bs=A->rmap->bs,am=a->mbs;
2597:   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2598:   PetscInt           reallocs=0,i,*ui;
2599:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2600:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2601:   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2602:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2603:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2604:   PetscBT            lnkbt;

2607:   /*
2608:    This code originally uses Modified Sparse Row (MSR) storage
2609:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2610:    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2611:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2612:    thus the original code in MSR format is still used for these cases.
2613:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2614:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2615:   */
2616:   if (bs > 1) {
2617:     MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2618:     return(0);
2619:   }

2621:   /* check whether perm is the identity mapping */
2622:   ISIdentity(perm,&perm_identity);
2623:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2624:   a->permute = PETSC_FALSE;

2626:   /* special case that simply copies fill pattern */
2627:   if (!levels) {
2628:     /* reuse the column pointers and row offsets for memory savings */
2629:     ui           = a->i;
2630:     uj           = a->j;
2631:     free_ij      = PETSC_FALSE;
2632:     ratio_needed = 1.0;
2633:   } else { /* case: levels>0 */
2634:     ISGetIndices(perm,&rip);

2636:     /* initialization */
2637:     PetscMalloc1(am+1,&ui);
2638:     ui[0] = 0;

2640:     /* jl: linked list for storing indices of the pivot rows
2641:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2642:     PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2643:     for (i=0; i<am; i++) {
2644:       jl[i] = am; il[i] = 0;
2645:     }

2647:     /* create and initialize a linked list for storing column indices of the active row k */
2648:     nlnk = am + 1;
2649:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2651:     /* initial FreeSpace size is fill*(ai[am]+1) */
2652:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);

2654:     current_space = free_space;

2656:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);

2658:     current_space_lvl = free_space_lvl;

2660:     for (k=0; k<am; k++) {  /* for each active row k */
2661:       /* initialize lnk by the column indices of row rip[k] */
2662:       nzk   = 0;
2663:       ncols = ai[rip[k]+1] - ai[rip[k]];
2664:       cols  = aj+ai[rip[k]];
2665:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2666:       nzk  += nlnk;

2668:       /* update lnk by computing fill-in for each pivot row to be merged in */
2669:       prow = jl[k]; /* 1st pivot row */

2671:       while (prow < k) {
2672:         nextprow = jl[prow];

2674:         /* merge prow into k-th row */
2675:         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2676:         jmax     = ui[prow+1];
2677:         ncols    = jmax-jmin;
2678:         i        = jmin - ui[prow];
2679:         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2680:         j        = *(uj_lvl_ptr[prow] + i - 1);
2681:         cols_lvl = uj_lvl_ptr[prow]+i;
2682:         PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2683:         nzk     += nlnk;

2685:         /* update il and jl for prow */
2686:         if (jmin < jmax) {
2687:           il[prow] = jmin;
2688:           j        = *cols;
2689:           jl[prow] = jl[j];
2690:           jl[j]    = prow;
2691:         }
2692:         prow = nextprow;
2693:       }

2695:       /* if free space is not available, make more free space */
2696:       if (current_space->local_remaining<nzk) {
2697:         i    = am - k + 1; /* num of unfactored rows */
2698:         i    = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2699:         PetscFreeSpaceGet(i,&current_space);
2700:         PetscFreeSpaceGet(i,&current_space_lvl);
2701:         reallocs++;
2702:       }

2704:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2705:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2707:       /* add the k-th row into il and jl */
2708:       if (nzk-1 > 0) {
2709:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2710:         jl[k] = jl[i]; jl[i] = k;
2711:         il[k] = ui[k] + 1;
2712:       }
2713:       uj_ptr[k]     = current_space->array;
2714:       uj_lvl_ptr[k] = current_space_lvl->array;

2716:       current_space->array               += nzk;
2717:       current_space->local_used          += nzk;
2718:       current_space->local_remaining     -= nzk;
2719:       current_space_lvl->array           += nzk;
2720:       current_space_lvl->local_used      += nzk;
2721:       current_space_lvl->local_remaining -= nzk;

2723:       ui[k+1] = ui[k] + nzk;
2724:     }

2726:     ISRestoreIndices(perm,&rip);
2727:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2729:     /* destroy list of free space and other temporary array(s) */
2730:     PetscMalloc1(ui[am]+1,&uj);
2731:     PetscFreeSpaceContiguous(&free_space,uj);
2732:     PetscIncompleteLLDestroy(lnk,lnkbt);
2733:     PetscFreeSpaceDestroy(free_space_lvl);
2734:     if (ai[am] != 0) {
2735:       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2736:     } else {
2737:       ratio_needed = 0.0;
2738:     }
2739:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2741:   /* put together the new matrix in MATSEQSBAIJ format */
2742:   MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2744:   b = (Mat_SeqSBAIJ*)(fact)->data;

2746:   PetscFree2(b->imax,b->ilen);

2748:   b->singlemalloc = PETSC_FALSE;
2749:   b->free_a       = PETSC_TRUE;
2750:   b->free_ij      = free_ij;

2752:   PetscMalloc1(ui[am]+1,&b->a);

2754:   b->j             = uj;
2755:   b->i             = ui;
2756:   b->diag          = NULL;
2757:   b->ilen          = NULL;
2758:   b->imax          = NULL;
2759:   b->row           = perm;
2760:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2762:   PetscObjectReference((PetscObject)perm);
2763:   b->icol = perm;
2764:   PetscObjectReference((PetscObject)perm);
2765:   PetscMalloc1(am+1,&b->solve_work);

2767:   b->maxnz = b->nz = ui[am];

2769:   fact->info.factor_mallocs    = reallocs;
2770:   fact->info.fill_ratio_given  = fill;
2771:   fact->info.fill_ratio_needed = ratio_needed;
2772: #if defined(PETSC_USE_INFO)
2773:   if (ai[am] != 0) {
2774:     PetscReal af = fact->info.fill_ratio_needed;
2775:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2776:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2777:     PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2778:   } else {
2779:     PetscInfo(A,"Empty matrix\n");
2780:   }
2781: #endif
2782:   if (perm_identity) {
2783:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2784:   } else {
2785:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2786:   }
2787:   return(0);
2788: }