Actual source code: sbaijfact2.c
2: /*
3: Factorization code for SBAIJ format.
4: */
6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
7: #include <../src/mat/impls/baij/seq/baij.h>
8: #include <petsc/private/kernels/blockinvert.h>
10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
11: {
12: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
13: IS isrow=a->row;
14: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
15: PetscErrorCode ierr;
16: const PetscInt *r;
17: PetscInt nz,*vj,k,idx,k1;
18: PetscInt bs =A->rmap->bs,bs2 = a->bs2;
19: const MatScalar *aa=a->a,*v,*diag;
20: PetscScalar *x,*xk,*xj,*xk_tmp,*t;
21: const PetscScalar *b;
24: VecGetArrayRead(bb,&b);
25: VecGetArray(xx,&x);
26: t = a->solve_work;
27: ISGetIndices(isrow,&r);
28: PetscMalloc1(bs,&xk_tmp);
30: /* solve U^T * D * y = b by forward substitution */
31: xk = t;
32: for (k=0; k<mbs; k++) { /* t <- perm(b) */
33: idx = bs*r[k];
34: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
35: }
36: for (k=0; k<mbs; k++) {
37: v = aa + bs2*ai[k];
38: xk = t + k*bs; /* Dk*xk = k-th block of x */
39: PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
40: nz = ai[k+1] - ai[k];
41: vj = aj + ai[k];
42: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
43: while (nz--) {
44: /* x(:) += U(k,:)^T*(Dk*xk) */
45: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
46: vj++; xj = t + (*vj)*bs;
47: v += bs2;
48: }
49: /* xk = inv(Dk)*(Dk*xk) */
50: diag = aa+k*bs2; /* ptr to inv(Dk) */
51: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
52: }
54: /* solve U*x = y by back substitution */
55: for (k=mbs-1; k>=0; k--) {
56: v = aa + bs2*ai[k];
57: xk = t + k*bs; /* xk */
58: nz = ai[k+1] - ai[k];
59: vj = aj + ai[k];
60: xj = t + (*vj)*bs;
61: while (nz--) {
62: /* xk += U(k,:)*x(:) */
63: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
64: vj++;
65: v += bs2; xj = t + (*vj)*bs;
66: }
67: idx = bs*r[k];
68: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
69: }
71: PetscFree(xk_tmp);
72: ISRestoreIndices(isrow,&r);
73: VecRestoreArrayRead(bb,&b);
74: VecRestoreArray(xx,&x);
75: PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
76: return(0);
77: }
79: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
80: {
82: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet");
83: }
85: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
86: {
88: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented");
89: }
91: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
92: {
93: PetscErrorCode ierr;
94: PetscInt nz,k;
95: const PetscInt *vj,bs2 = bs*bs;
96: const MatScalar *v,*diag;
97: PetscScalar *xk,*xj,*xk_tmp;
100: PetscMalloc1(bs,&xk_tmp);
101: for (k=0; k<mbs; k++) {
102: v = aa + bs2*ai[k];
103: xk = x + k*bs; /* Dk*xk = k-th block of x */
104: PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
105: nz = ai[k+1] - ai[k];
106: vj = aj + ai[k];
107: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
108: while (nz--) {
109: /* x(:) += U(k,:)^T*(Dk*xk) */
110: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
111: vj++; xj = x + (*vj)*bs;
112: v += bs2;
113: }
114: /* xk = inv(Dk)*(Dk*xk) */
115: diag = aa+k*bs2; /* ptr to inv(Dk) */
116: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
117: }
118: PetscFree(xk_tmp);
119: return(0);
120: }
122: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
123: {
124: PetscInt nz,k;
125: const PetscInt *vj,bs2 = bs*bs;
126: const MatScalar *v;
127: PetscScalar *xk,*xj;
130: for (k=mbs-1; k>=0; k--) {
131: v = aa + bs2*ai[k];
132: xk = x + k*bs; /* xk */
133: nz = ai[k+1] - ai[k];
134: vj = aj + ai[k];
135: xj = x + (*vj)*bs;
136: while (nz--) {
137: /* xk += U(k,:)*x(:) */
138: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
139: vj++;
140: v += bs2; xj = x + (*vj)*bs;
141: }
142: }
143: return(0);
144: }
146: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
147: {
148: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
149: PetscErrorCode ierr;
150: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
151: PetscInt bs =A->rmap->bs;
152: const MatScalar *aa=a->a;
153: PetscScalar *x;
154: const PetscScalar *b;
157: VecGetArrayRead(bb,&b);
158: VecGetArray(xx,&x);
160: /* solve U^T * D * y = b by forward substitution */
161: PetscArraycpy(x,b,bs*mbs); /* x <- b */
162: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
164: /* solve U*x = y by back substitution */
165: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
167: VecRestoreArrayRead(bb,&b);
168: VecRestoreArray(xx,&x);
169: PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);
170: return(0);
171: }
173: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
174: {
175: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
176: PetscErrorCode ierr;
177: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
178: PetscInt bs =A->rmap->bs;
179: const MatScalar *aa=a->a;
180: const PetscScalar *b;
181: PetscScalar *x;
184: VecGetArrayRead(bb,&b);
185: VecGetArray(xx,&x);
186: PetscArraycpy(x,b,bs*mbs); /* x <- b */
187: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
188: VecRestoreArrayRead(bb,&b);
189: VecRestoreArray(xx,&x);
190: PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);
191: return(0);
192: }
194: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
195: {
196: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
197: PetscErrorCode ierr;
198: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
199: PetscInt bs =A->rmap->bs;
200: const MatScalar *aa=a->a;
201: const PetscScalar *b;
202: PetscScalar *x;
205: VecGetArrayRead(bb,&b);
206: VecGetArray(xx,&x);
207: PetscArraycpy(x,b,bs*mbs);
208: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
209: VecRestoreArrayRead(bb,&b);
210: VecRestoreArray(xx,&x);
211: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
212: return(0);
213: }
215: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
216: {
217: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
218: IS isrow=a->row;
219: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
220: PetscErrorCode ierr;
221: PetscInt nz,k,idx;
222: const MatScalar *aa=a->a,*v,*d;
223: PetscScalar *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
224: const PetscScalar *b;
227: VecGetArrayRead(bb,&b);
228: VecGetArray(xx,&x);
229: t = a->solve_work;
230: ISGetIndices(isrow,&r);
232: /* solve U^T * D * y = b by forward substitution */
233: tp = t;
234: for (k=0; k<mbs; k++) { /* t <- perm(b) */
235: idx = 7*r[k];
236: tp[0] = b[idx];
237: tp[1] = b[idx+1];
238: tp[2] = b[idx+2];
239: tp[3] = b[idx+3];
240: tp[4] = b[idx+4];
241: tp[5] = b[idx+5];
242: tp[6] = b[idx+6];
243: tp += 7;
244: }
246: for (k=0; k<mbs; k++) {
247: v = aa + 49*ai[k];
248: vj = aj + ai[k];
249: tp = t + k*7;
250: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
251: nz = ai[k+1] - ai[k];
252: tp = t + (*vj)*7;
253: while (nz--) {
254: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
255: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
256: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
257: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
258: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
259: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
260: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
261: vj++;
262: tp = t + (*vj)*7;
263: v += 49;
264: }
266: /* xk = inv(Dk)*(Dk*xk) */
267: d = aa+k*49; /* ptr to inv(Dk) */
268: tp = t + k*7;
269: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
270: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
271: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
272: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
273: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
274: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
275: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
276: }
278: /* solve U*x = y by back substitution */
279: for (k=mbs-1; k>=0; k--) {
280: v = aa + 49*ai[k];
281: vj = aj + ai[k];
282: tp = t + k*7;
283: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
284: nz = ai[k+1] - ai[k];
286: tp = t + (*vj)*7;
287: while (nz--) {
288: /* xk += U(k,:)*x(:) */
289: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
290: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
291: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
292: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
293: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
294: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
295: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
296: vj++;
297: tp = t + (*vj)*7;
298: v += 49;
299: }
300: tp = t + k*7;
301: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
302: idx = 7*r[k];
303: x[idx] = x0;
304: x[idx+1] = x1;
305: x[idx+2] = x2;
306: x[idx+3] = x3;
307: x[idx+4] = x4;
308: x[idx+5] = x5;
309: x[idx+6] = x6;
310: }
312: ISRestoreIndices(isrow,&r);
313: VecRestoreArrayRead(bb,&b);
314: VecRestoreArray(xx,&x);
315: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
316: return(0);
317: }
319: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
320: {
321: const MatScalar *v,*d;
322: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
323: PetscInt nz,k;
324: const PetscInt *vj;
327: for (k=0; k<mbs; k++) {
328: v = aa + 49*ai[k];
329: xp = x + k*7;
330: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
331: nz = ai[k+1] - ai[k];
332: vj = aj + ai[k];
333: xp = x + (*vj)*7;
334: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
335: PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
336: while (nz--) {
337: /* x(:) += U(k,:)^T*(Dk*xk) */
338: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
339: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
340: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
341: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
342: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
343: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
344: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
345: vj++;
346: xp = x + (*vj)*7;
347: v += 49;
348: }
349: /* xk = inv(Dk)*(Dk*xk) */
350: d = aa+k*49; /* ptr to inv(Dk) */
351: xp = x + k*7;
352: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
353: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
354: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
355: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
356: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
357: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
358: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
359: }
360: return(0);
361: }
363: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
364: {
365: const MatScalar *v;
366: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
367: PetscInt nz,k;
368: const PetscInt *vj;
371: for (k=mbs-1; k>=0; k--) {
372: v = aa + 49*ai[k];
373: xp = x + k*7;
374: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
375: nz = ai[k+1] - ai[k];
376: vj = aj + ai[k];
377: xp = x + (*vj)*7;
378: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
379: PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
380: while (nz--) {
381: /* xk += U(k,:)*x(:) */
382: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
383: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
384: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
385: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
386: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
387: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
388: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
389: vj++;
390: v += 49; xp = x + (*vj)*7;
391: }
392: xp = x + k*7;
393: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
394: }
395: return(0);
396: }
398: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
399: {
400: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
401: PetscErrorCode ierr;
402: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
403: const MatScalar *aa=a->a;
404: PetscScalar *x;
405: const PetscScalar *b;
408: VecGetArrayRead(bb,&b);
409: VecGetArray(xx,&x);
411: /* solve U^T * D * y = b by forward substitution */
412: PetscArraycpy(x,b,7*mbs); /* x <- b */
413: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
415: /* solve U*x = y by back substitution */
416: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
418: VecRestoreArrayRead(bb,&b);
419: VecRestoreArray(xx,&x);
420: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
421: return(0);
422: }
424: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
425: {
426: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
427: PetscErrorCode ierr;
428: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
429: const MatScalar *aa=a->a;
430: PetscScalar *x;
431: const PetscScalar *b;
434: VecGetArrayRead(bb,&b);
435: VecGetArray(xx,&x);
436: PetscArraycpy(x,b,7*mbs);
437: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
438: VecRestoreArrayRead(bb,&b);
439: VecRestoreArray(xx,&x);
440: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
441: return(0);
442: }
444: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
445: {
446: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
447: PetscErrorCode ierr;
448: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
449: const MatScalar *aa=a->a;
450: PetscScalar *x;
451: const PetscScalar *b;
454: VecGetArrayRead(bb,&b);
455: VecGetArray(xx,&x);
456: PetscArraycpy(x,b,7*mbs);
457: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
458: VecRestoreArrayRead(bb,&b);
459: VecRestoreArray(xx,&x);
460: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
461: return(0);
462: }
464: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
465: {
466: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
467: IS isrow=a->row;
468: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
469: PetscErrorCode ierr;
470: PetscInt nz,k,idx;
471: const MatScalar *aa=a->a,*v,*d;
472: PetscScalar *x,x0,x1,x2,x3,x4,x5,*t,*tp;
473: const PetscScalar *b;
476: VecGetArrayRead(bb,&b);
477: VecGetArray(xx,&x);
478: t = a->solve_work;
479: ISGetIndices(isrow,&r);
481: /* solve U^T * D * y = b by forward substitution */
482: tp = t;
483: for (k=0; k<mbs; k++) { /* t <- perm(b) */
484: idx = 6*r[k];
485: tp[0] = b[idx];
486: tp[1] = b[idx+1];
487: tp[2] = b[idx+2];
488: tp[3] = b[idx+3];
489: tp[4] = b[idx+4];
490: tp[5] = b[idx+5];
491: tp += 6;
492: }
494: for (k=0; k<mbs; k++) {
495: v = aa + 36*ai[k];
496: vj = aj + ai[k];
497: tp = t + k*6;
498: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
499: nz = ai[k+1] - ai[k];
500: tp = t + (*vj)*6;
501: while (nz--) {
502: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
503: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
504: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
505: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
506: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
507: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
508: vj++;
509: tp = t + (*vj)*6;
510: v += 36;
511: }
513: /* xk = inv(Dk)*(Dk*xk) */
514: d = aa+k*36; /* ptr to inv(Dk) */
515: tp = t + k*6;
516: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
517: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
518: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
519: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
520: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
521: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
522: }
524: /* solve U*x = y by back substitution */
525: for (k=mbs-1; k>=0; k--) {
526: v = aa + 36*ai[k];
527: vj = aj + ai[k];
528: tp = t + k*6;
529: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
530: nz = ai[k+1] - ai[k];
532: tp = t + (*vj)*6;
533: while (nz--) {
534: /* xk += U(k,:)*x(:) */
535: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
536: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
537: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
538: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
539: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
540: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
541: vj++;
542: tp = t + (*vj)*6;
543: v += 36;
544: }
545: tp = t + k*6;
546: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
547: idx = 6*r[k];
548: x[idx] = x0;
549: x[idx+1] = x1;
550: x[idx+2] = x2;
551: x[idx+3] = x3;
552: x[idx+4] = x4;
553: x[idx+5] = x5;
554: }
556: ISRestoreIndices(isrow,&r);
557: VecRestoreArrayRead(bb,&b);
558: VecRestoreArray(xx,&x);
559: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
560: return(0);
561: }
563: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
564: {
565: const MatScalar *v,*d;
566: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
567: PetscInt nz,k;
568: const PetscInt *vj;
571: for (k=0; k<mbs; k++) {
572: v = aa + 36*ai[k];
573: xp = x + k*6;
574: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
575: nz = ai[k+1] - ai[k];
576: vj = aj + ai[k];
577: xp = x + (*vj)*6;
578: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
579: PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580: while (nz--) {
581: /* x(:) += U(k,:)^T*(Dk*xk) */
582: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
583: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
584: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
585: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
586: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
587: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
588: vj++;
589: xp = x + (*vj)*6;
590: v += 36;
591: }
592: /* xk = inv(Dk)*(Dk*xk) */
593: d = aa+k*36; /* ptr to inv(Dk) */
594: xp = x + k*6;
595: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
596: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
597: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
598: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
599: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
600: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
601: }
602: return(0);
603: }
604: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
605: {
606: const MatScalar *v;
607: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
608: PetscInt nz,k;
609: const PetscInt *vj;
612: for (k=mbs-1; k>=0; k--) {
613: v = aa + 36*ai[k];
614: xp = x + k*6;
615: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
616: nz = ai[k+1] - ai[k];
617: vj = aj + ai[k];
618: xp = x + (*vj)*6;
619: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
620: PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
621: while (nz--) {
622: /* xk += U(k,:)*x(:) */
623: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
624: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
625: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
626: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
627: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
628: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
629: vj++;
630: v += 36; xp = x + (*vj)*6;
631: }
632: xp = x + k*6;
633: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
634: }
635: return(0);
636: }
638: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
639: {
640: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
641: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
642: const MatScalar *aa=a->a;
643: PetscScalar *x;
644: const PetscScalar *b;
645: PetscErrorCode ierr;
648: VecGetArrayRead(bb,&b);
649: VecGetArray(xx,&x);
651: /* solve U^T * D * y = b by forward substitution */
652: PetscArraycpy(x,b,6*mbs); /* x <- b */
653: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
655: /* solve U*x = y by back substitution */
656: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
658: VecRestoreArrayRead(bb,&b);
659: VecRestoreArray(xx,&x);
660: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
661: return(0);
662: }
664: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
665: {
666: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
667: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
668: const MatScalar *aa=a->a;
669: PetscScalar *x;
670: const PetscScalar *b;
671: PetscErrorCode ierr;
674: VecGetArrayRead(bb,&b);
675: VecGetArray(xx,&x);
676: PetscArraycpy(x,b,6*mbs); /* x <- b */
677: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
678: VecRestoreArrayRead(bb,&b);
679: VecRestoreArray(xx,&x);
680: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
681: return(0);
682: }
684: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
685: {
686: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
687: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
688: const MatScalar *aa=a->a;
689: PetscScalar *x;
690: const PetscScalar *b;
691: PetscErrorCode ierr;
694: VecGetArrayRead(bb,&b);
695: VecGetArray(xx,&x);
696: PetscArraycpy(x,b,6*mbs); /* x <- b */
697: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
698: VecRestoreArrayRead(bb,&b);
699: VecRestoreArray(xx,&x);
700: PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
701: return(0);
702: }
704: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
705: {
706: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
707: IS isrow=a->row;
708: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
709: PetscErrorCode ierr;
710: const PetscInt *r,*vj;
711: PetscInt nz,k,idx;
712: const MatScalar *aa=a->a,*v,*diag;
713: PetscScalar *x,x0,x1,x2,x3,x4,*t,*tp;
714: const PetscScalar *b;
717: VecGetArrayRead(bb,&b);
718: VecGetArray(xx,&x);
719: t = a->solve_work;
720: ISGetIndices(isrow,&r);
722: /* solve U^T * D * y = b by forward substitution */
723: tp = t;
724: for (k=0; k<mbs; k++) { /* t <- perm(b) */
725: idx = 5*r[k];
726: tp[0] = b[idx];
727: tp[1] = b[idx+1];
728: tp[2] = b[idx+2];
729: tp[3] = b[idx+3];
730: tp[4] = b[idx+4];
731: tp += 5;
732: }
734: for (k=0; k<mbs; k++) {
735: v = aa + 25*ai[k];
736: vj = aj + ai[k];
737: tp = t + k*5;
738: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
739: nz = ai[k+1] - ai[k];
741: tp = t + (*vj)*5;
742: while (nz--) {
743: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
744: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
745: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
746: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
747: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
748: vj++;
749: tp = t + (*vj)*5;
750: v += 25;
751: }
753: /* xk = inv(Dk)*(Dk*xk) */
754: diag = aa+k*25; /* ptr to inv(Dk) */
755: tp = t + k*5;
756: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
757: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
758: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
759: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
760: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
761: }
763: /* solve U*x = y by back substitution */
764: for (k=mbs-1; k>=0; k--) {
765: v = aa + 25*ai[k];
766: vj = aj + ai[k];
767: tp = t + k*5;
768: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
769: nz = ai[k+1] - ai[k];
771: tp = t + (*vj)*5;
772: while (nz--) {
773: /* xk += U(k,:)*x(:) */
774: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
775: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
776: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
777: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
778: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
779: vj++;
780: tp = t + (*vj)*5;
781: v += 25;
782: }
783: tp = t + k*5;
784: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
785: idx = 5*r[k];
786: x[idx] = x0;
787: x[idx+1] = x1;
788: x[idx+2] = x2;
789: x[idx+3] = x3;
790: x[idx+4] = x4;
791: }
793: ISRestoreIndices(isrow,&r);
794: VecRestoreArrayRead(bb,&b);
795: VecRestoreArray(xx,&x);
796: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
797: return(0);
798: }
800: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
801: {
802: const MatScalar *v,*diag;
803: PetscScalar *xp,x0,x1,x2,x3,x4;
804: PetscInt nz,k;
805: const PetscInt *vj;
808: for (k=0; k<mbs; k++) {
809: v = aa + 25*ai[k];
810: xp = x + k*5;
811: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
812: nz = ai[k+1] - ai[k];
813: vj = aj + ai[k];
814: xp = x + (*vj)*5;
815: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
816: PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
817: while (nz--) {
818: /* x(:) += U(k,:)^T*(Dk*xk) */
819: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
820: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
821: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
822: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
823: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
824: vj++;
825: xp = x + (*vj)*5;
826: v += 25;
827: }
828: /* xk = inv(Dk)*(Dk*xk) */
829: diag = aa+k*25; /* ptr to inv(Dk) */
830: xp = x + k*5;
831: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
832: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
833: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
834: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
835: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
836: }
837: return(0);
838: }
840: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
841: {
842: const MatScalar *v;
843: PetscScalar *xp,x0,x1,x2,x3,x4;
844: PetscInt nz,k;
845: const PetscInt *vj;
848: for (k=mbs-1; k>=0; k--) {
849: v = aa + 25*ai[k];
850: xp = x + k*5;
851: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
852: nz = ai[k+1] - ai[k];
853: vj = aj + ai[k];
854: xp = x + (*vj)*5;
855: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
856: PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
857: while (nz--) {
858: /* xk += U(k,:)*x(:) */
859: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
860: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
861: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
862: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
863: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
864: vj++;
865: v += 25; xp = x + (*vj)*5;
866: }
867: xp = x + k*5;
868: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
869: }
870: return(0);
871: }
873: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
874: {
875: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
876: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
877: const MatScalar *aa=a->a;
878: PetscScalar *x;
879: const PetscScalar *b;
880: PetscErrorCode ierr;
883: VecGetArrayRead(bb,&b);
884: VecGetArray(xx,&x);
886: /* solve U^T * D * y = b by forward substitution */
887: PetscArraycpy(x,b,5*mbs); /* x <- b */
888: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
890: /* solve U*x = y by back substitution */
891: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
893: VecRestoreArrayRead(bb,&b);
894: VecRestoreArray(xx,&x);
895: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
896: return(0);
897: }
899: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
900: {
901: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
902: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
903: const MatScalar *aa=a->a;
904: PetscScalar *x;
905: const PetscScalar *b;
906: PetscErrorCode ierr;
909: VecGetArrayRead(bb,&b);
910: VecGetArray(xx,&x);
911: PetscArraycpy(x,b,5*mbs); /* x <- b */
912: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
913: VecRestoreArrayRead(bb,&b);
914: VecRestoreArray(xx,&x);
915: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
916: return(0);
917: }
919: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
920: {
921: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
922: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
923: const MatScalar *aa=a->a;
924: PetscScalar *x;
925: const PetscScalar *b;
926: PetscErrorCode ierr;
929: VecGetArrayRead(bb,&b);
930: VecGetArray(xx,&x);
931: PetscArraycpy(x,b,5*mbs);
932: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
933: VecRestoreArrayRead(bb,&b);
934: VecRestoreArray(xx,&x);
935: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
936: return(0);
937: }
939: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
940: {
941: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
942: IS isrow=a->row;
943: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
944: PetscErrorCode ierr;
945: const PetscInt *r,*vj;
946: PetscInt nz,k,idx;
947: const MatScalar *aa=a->a,*v,*diag;
948: PetscScalar *x,x0,x1,x2,x3,*t,*tp;
949: const PetscScalar *b;
952: VecGetArrayRead(bb,&b);
953: VecGetArray(xx,&x);
954: t = a->solve_work;
955: ISGetIndices(isrow,&r);
957: /* solve U^T * D * y = b by forward substitution */
958: tp = t;
959: for (k=0; k<mbs; k++) { /* t <- perm(b) */
960: idx = 4*r[k];
961: tp[0] = b[idx];
962: tp[1] = b[idx+1];
963: tp[2] = b[idx+2];
964: tp[3] = b[idx+3];
965: tp += 4;
966: }
968: for (k=0; k<mbs; k++) {
969: v = aa + 16*ai[k];
970: vj = aj + ai[k];
971: tp = t + k*4;
972: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
973: nz = ai[k+1] - ai[k];
975: tp = t + (*vj)*4;
976: while (nz--) {
977: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
978: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
979: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
980: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
981: vj++;
982: tp = t + (*vj)*4;
983: v += 16;
984: }
986: /* xk = inv(Dk)*(Dk*xk) */
987: diag = aa+k*16; /* ptr to inv(Dk) */
988: tp = t + k*4;
989: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
990: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
991: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
992: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
993: }
995: /* solve U*x = y by back substitution */
996: for (k=mbs-1; k>=0; k--) {
997: v = aa + 16*ai[k];
998: vj = aj + ai[k];
999: tp = t + k*4;
1000: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1001: nz = ai[k+1] - ai[k];
1003: tp = t + (*vj)*4;
1004: while (nz--) {
1005: /* xk += U(k,:)*x(:) */
1006: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1007: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1008: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1009: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1010: vj++;
1011: tp = t + (*vj)*4;
1012: v += 16;
1013: }
1014: tp = t + k*4;
1015: tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1016: idx = 4*r[k];
1017: x[idx] = x0;
1018: x[idx+1] = x1;
1019: x[idx+2] = x2;
1020: x[idx+3] = x3;
1021: }
1023: ISRestoreIndices(isrow,&r);
1024: VecRestoreArrayRead(bb,&b);
1025: VecRestoreArray(xx,&x);
1026: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1027: return(0);
1028: }
1030: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1031: {
1032: const MatScalar *v,*diag;
1033: PetscScalar *xp,x0,x1,x2,x3;
1034: PetscInt nz,k;
1035: const PetscInt *vj;
1038: for (k=0; k<mbs; k++) {
1039: v = aa + 16*ai[k];
1040: xp = x + k*4;
1041: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1042: nz = ai[k+1] - ai[k];
1043: vj = aj + ai[k];
1044: xp = x + (*vj)*4;
1045: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1046: PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1047: while (nz--) {
1048: /* x(:) += U(k,:)^T*(Dk*xk) */
1049: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1050: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1051: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1052: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1053: vj++;
1054: xp = x + (*vj)*4;
1055: v += 16;
1056: }
1057: /* xk = inv(Dk)*(Dk*xk) */
1058: diag = aa+k*16; /* ptr to inv(Dk) */
1059: xp = x + k*4;
1060: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1061: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1062: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1063: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1064: }
1065: return(0);
1066: }
1068: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1069: {
1070: const MatScalar *v;
1071: PetscScalar *xp,x0,x1,x2,x3;
1072: PetscInt nz,k;
1073: const PetscInt *vj;
1076: for (k=mbs-1; k>=0; k--) {
1077: v = aa + 16*ai[k];
1078: xp = x + k*4;
1079: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1080: nz = ai[k+1] - ai[k];
1081: vj = aj + ai[k];
1082: xp = x + (*vj)*4;
1083: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1084: PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1085: while (nz--) {
1086: /* xk += U(k,:)*x(:) */
1087: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1088: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1089: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1090: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1091: vj++;
1092: v += 16; xp = x + (*vj)*4;
1093: }
1094: xp = x + k*4;
1095: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1096: }
1097: return(0);
1098: }
1100: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1101: {
1102: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1103: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1104: const MatScalar *aa=a->a;
1105: PetscScalar *x;
1106: const PetscScalar *b;
1107: PetscErrorCode ierr;
1110: VecGetArrayRead(bb,&b);
1111: VecGetArray(xx,&x);
1113: /* solve U^T * D * y = b by forward substitution */
1114: PetscArraycpy(x,b,4*mbs); /* x <- b */
1115: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1117: /* solve U*x = y by back substitution */
1118: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1119: VecRestoreArrayRead(bb,&b);
1120: VecRestoreArray(xx,&x);
1121: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1122: return(0);
1123: }
1125: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1126: {
1127: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1128: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1129: const MatScalar *aa=a->a;
1130: PetscScalar *x;
1131: const PetscScalar *b;
1132: PetscErrorCode ierr;
1135: VecGetArrayRead(bb,&b);
1136: VecGetArray(xx,&x);
1137: PetscArraycpy(x,b,4*mbs); /* x <- b */
1138: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1139: VecRestoreArrayRead(bb,&b);
1140: VecRestoreArray(xx,&x);
1141: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1142: return(0);
1143: }
1145: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1146: {
1147: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1148: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1149: const MatScalar *aa=a->a;
1150: PetscScalar *x;
1151: const PetscScalar *b;
1152: PetscErrorCode ierr;
1155: VecGetArrayRead(bb,&b);
1156: VecGetArray(xx,&x);
1157: PetscArraycpy(x,b,4*mbs);
1158: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1159: VecRestoreArrayRead(bb,&b);
1160: VecRestoreArray(xx,&x);
1161: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1162: return(0);
1163: }
1165: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1166: {
1167: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1168: IS isrow=a->row;
1169: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1170: PetscErrorCode ierr;
1171: const PetscInt *r;
1172: PetscInt nz,k,idx;
1173: const PetscInt *vj;
1174: const MatScalar *aa=a->a,*v,*diag;
1175: PetscScalar *x,x0,x1,x2,*t,*tp;
1176: const PetscScalar *b;
1179: VecGetArrayRead(bb,&b);
1180: VecGetArray(xx,&x);
1181: t = a->solve_work;
1182: ISGetIndices(isrow,&r);
1184: /* solve U^T * D * y = b by forward substitution */
1185: tp = t;
1186: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1187: idx = 3*r[k];
1188: tp[0] = b[idx];
1189: tp[1] = b[idx+1];
1190: tp[2] = b[idx+2];
1191: tp += 3;
1192: }
1194: for (k=0; k<mbs; k++) {
1195: v = aa + 9*ai[k];
1196: vj = aj + ai[k];
1197: tp = t + k*3;
1198: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1199: nz = ai[k+1] - ai[k];
1201: tp = t + (*vj)*3;
1202: while (nz--) {
1203: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1204: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1205: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1206: vj++;
1207: tp = t + (*vj)*3;
1208: v += 9;
1209: }
1211: /* xk = inv(Dk)*(Dk*xk) */
1212: diag = aa+k*9; /* ptr to inv(Dk) */
1213: tp = t + k*3;
1214: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1215: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1216: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1217: }
1219: /* solve U*x = y by back substitution */
1220: for (k=mbs-1; k>=0; k--) {
1221: v = aa + 9*ai[k];
1222: vj = aj + ai[k];
1223: tp = t + k*3;
1224: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
1225: nz = ai[k+1] - ai[k];
1227: tp = t + (*vj)*3;
1228: while (nz--) {
1229: /* xk += U(k,:)*x(:) */
1230: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1231: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1232: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1233: vj++;
1234: tp = t + (*vj)*3;
1235: v += 9;
1236: }
1237: tp = t + k*3;
1238: tp[0] = x0; tp[1] = x1; tp[2] = x2;
1239: idx = 3*r[k];
1240: x[idx] = x0;
1241: x[idx+1] = x1;
1242: x[idx+2] = x2;
1243: }
1245: ISRestoreIndices(isrow,&r);
1246: VecRestoreArrayRead(bb,&b);
1247: VecRestoreArray(xx,&x);
1248: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1249: return(0);
1250: }
1252: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1253: {
1254: const MatScalar *v,*diag;
1255: PetscScalar *xp,x0,x1,x2;
1256: PetscInt nz,k;
1257: const PetscInt *vj;
1260: for (k=0; k<mbs; k++) {
1261: v = aa + 9*ai[k];
1262: xp = x + k*3;
1263: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1264: nz = ai[k+1] - ai[k];
1265: vj = aj + ai[k];
1266: xp = x + (*vj)*3;
1267: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1268: PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1269: while (nz--) {
1270: /* x(:) += U(k,:)^T*(Dk*xk) */
1271: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1272: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1273: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1274: vj++;
1275: xp = x + (*vj)*3;
1276: v += 9;
1277: }
1278: /* xk = inv(Dk)*(Dk*xk) */
1279: diag = aa+k*9; /* ptr to inv(Dk) */
1280: xp = x + k*3;
1281: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1282: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1283: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1284: }
1285: return(0);
1286: }
1288: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1289: {
1290: const MatScalar *v;
1291: PetscScalar *xp,x0,x1,x2;
1292: PetscInt nz,k;
1293: const PetscInt *vj;
1296: for (k=mbs-1; k>=0; k--) {
1297: v = aa + 9*ai[k];
1298: xp = x + k*3;
1299: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
1300: nz = ai[k+1] - ai[k];
1301: vj = aj + ai[k];
1302: xp = x + (*vj)*3;
1303: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1304: PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1305: while (nz--) {
1306: /* xk += U(k,:)*x(:) */
1307: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1308: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1309: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1310: vj++;
1311: v += 9; xp = x + (*vj)*3;
1312: }
1313: xp = x + k*3;
1314: xp[0] = x0; xp[1] = x1; xp[2] = x2;
1315: }
1316: return(0);
1317: }
1319: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1320: {
1321: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1322: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1323: const MatScalar *aa=a->a;
1324: PetscScalar *x;
1325: const PetscScalar *b;
1326: PetscErrorCode ierr;
1329: VecGetArrayRead(bb,&b);
1330: VecGetArray(xx,&x);
1332: /* solve U^T * D * y = b by forward substitution */
1333: PetscArraycpy(x,b,3*mbs);
1334: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1336: /* solve U*x = y by back substitution */
1337: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1339: VecRestoreArrayRead(bb,&b);
1340: VecRestoreArray(xx,&x);
1341: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1342: return(0);
1343: }
1345: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1346: {
1347: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1348: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1349: const MatScalar *aa=a->a;
1350: PetscScalar *x;
1351: const PetscScalar *b;
1352: PetscErrorCode ierr;
1355: VecGetArrayRead(bb,&b);
1356: VecGetArray(xx,&x);
1357: PetscArraycpy(x,b,3*mbs);
1358: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1359: VecRestoreArrayRead(bb,&b);
1360: VecRestoreArray(xx,&x);
1361: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1362: return(0);
1363: }
1365: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1366: {
1367: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1368: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1369: const MatScalar *aa=a->a;
1370: PetscScalar *x;
1371: const PetscScalar *b;
1372: PetscErrorCode ierr;
1375: VecGetArrayRead(bb,&b);
1376: VecGetArray(xx,&x);
1377: PetscArraycpy(x,b,3*mbs);
1378: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1379: VecRestoreArrayRead(bb,&b);
1380: VecRestoreArray(xx,&x);
1381: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1382: return(0);
1383: }
1385: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1386: {
1387: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1388: IS isrow=a->row;
1389: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1390: PetscErrorCode ierr;
1391: const PetscInt *r,*vj;
1392: PetscInt nz,k,k2,idx;
1393: const MatScalar *aa=a->a,*v,*diag;
1394: PetscScalar *x,x0,x1,*t;
1395: const PetscScalar *b;
1398: VecGetArrayRead(bb,&b);
1399: VecGetArray(xx,&x);
1400: t = a->solve_work;
1401: ISGetIndices(isrow,&r);
1403: /* solve U^T * D * y = perm(b) by forward substitution */
1404: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1405: idx = 2*r[k];
1406: t[k*2] = b[idx];
1407: t[k*2+1] = b[idx+1];
1408: }
1409: for (k=0; k<mbs; k++) {
1410: v = aa + 4*ai[k];
1411: vj = aj + ai[k];
1412: k2 = k*2;
1413: x0 = t[k2]; x1 = t[k2+1];
1414: nz = ai[k+1] - ai[k];
1415: while (nz--) {
1416: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1417: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1418: vj++; v += 4;
1419: }
1420: diag = aa+k*4; /* ptr to inv(Dk) */
1421: t[k2] = diag[0]*x0 + diag[2]*x1;
1422: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1423: }
1425: /* solve U*x = y by back substitution */
1426: for (k=mbs-1; k>=0; k--) {
1427: v = aa + 4*ai[k];
1428: vj = aj + ai[k];
1429: k2 = k*2;
1430: x0 = t[k2]; x1 = t[k2+1];
1431: nz = ai[k+1] - ai[k];
1432: while (nz--) {
1433: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1434: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1435: vj++;
1436: v += 4;
1437: }
1438: t[k2] = x0;
1439: t[k2+1] = x1;
1440: idx = 2*r[k];
1441: x[idx] = x0;
1442: x[idx+1] = x1;
1443: }
1445: ISRestoreIndices(isrow,&r);
1446: VecRestoreArrayRead(bb,&b);
1447: VecRestoreArray(xx,&x);
1448: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1449: return(0);
1450: }
1452: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1453: {
1454: const MatScalar *v,*diag;
1455: PetscScalar x0,x1;
1456: PetscInt nz,k,k2;
1457: const PetscInt *vj;
1460: for (k=0; k<mbs; k++) {
1461: v = aa + 4*ai[k];
1462: vj = aj + ai[k];
1463: k2 = k*2;
1464: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1465: nz = ai[k+1] - ai[k];
1466: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1467: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1468: while (nz--) {
1469: /* x(:) += U(k,:)^T*(Dk*xk) */
1470: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1471: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1472: vj++; v += 4;
1473: }
1474: /* xk = inv(Dk)*(Dk*xk) */
1475: diag = aa+k*4; /* ptr to inv(Dk) */
1476: x[k2] = diag[0]*x0 + diag[2]*x1;
1477: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1478: }
1479: return(0);
1480: }
1482: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1483: {
1484: const MatScalar *v;
1485: PetscScalar x0,x1;
1486: PetscInt nz,k,k2;
1487: const PetscInt *vj;
1490: for (k=mbs-1; k>=0; k--) {
1491: v = aa + 4*ai[k];
1492: vj = aj + ai[k];
1493: k2 = k*2;
1494: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1495: nz = ai[k+1] - ai[k];
1496: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1497: PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1498: while (nz--) {
1499: /* xk += U(k,:)*x(:) */
1500: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1501: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1502: vj++;
1503: v += 4;
1504: }
1505: x[k2] = x0;
1506: x[k2+1] = x1;
1507: }
1508: return(0);
1509: }
1511: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1512: {
1513: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1514: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1515: const MatScalar *aa=a->a;
1516: PetscScalar *x;
1517: const PetscScalar *b;
1518: PetscErrorCode ierr;
1521: VecGetArrayRead(bb,&b);
1522: VecGetArray(xx,&x);
1524: /* solve U^T * D * y = b by forward substitution */
1525: PetscArraycpy(x,b,2*mbs);
1526: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1528: /* solve U*x = y by back substitution */
1529: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1531: VecRestoreArrayRead(bb,&b);
1532: VecRestoreArray(xx,&x);
1533: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1534: return(0);
1535: }
1537: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1538: {
1539: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1540: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1541: const MatScalar *aa=a->a;
1542: PetscScalar *x;
1543: const PetscScalar *b;
1544: PetscErrorCode ierr;
1547: VecGetArrayRead(bb,&b);
1548: VecGetArray(xx,&x);
1549: PetscArraycpy(x,b,2*mbs);
1550: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1551: VecRestoreArrayRead(bb,&b);
1552: VecRestoreArray(xx,&x);
1553: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1554: return(0);
1555: }
1557: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1558: {
1559: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1560: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1561: const MatScalar *aa=a->a;
1562: PetscScalar *x;
1563: const PetscScalar *b;
1564: PetscErrorCode ierr;
1567: VecGetArrayRead(bb,&b);
1568: VecGetArray(xx,&x);
1569: PetscArraycpy(x,b,2*mbs);
1570: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1571: VecRestoreArrayRead(bb,&b);
1572: VecRestoreArray(xx,&x);
1573: PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
1574: return(0);
1575: }
1577: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1578: {
1579: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1580: IS isrow=a->row;
1581: PetscErrorCode ierr;
1582: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1583: const MatScalar *aa=a->a,*v;
1584: const PetscScalar *b;
1585: PetscScalar *x,xk,*t;
1586: PetscInt nz,k,j;
1589: VecGetArrayRead(bb,&b);
1590: VecGetArray(xx,&x);
1591: t = a->solve_work;
1592: ISGetIndices(isrow,&rp);
1594: /* solve U^T*D*y = perm(b) by forward substitution */
1595: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1596: for (k=0; k<mbs; k++) {
1597: v = aa + ai[k];
1598: vj = aj + ai[k];
1599: xk = t[k];
1600: nz = ai[k+1] - ai[k] - 1;
1601: for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1602: t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */
1603: }
1605: /* solve U*perm(x) = y by back substitution */
1606: for (k=mbs-1; k>=0; k--) {
1607: v = aa + adiag[k] - 1;
1608: vj = aj + adiag[k] - 1;
1609: nz = ai[k+1] - ai[k] - 1;
1610: for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1611: x[rp[k]] = t[k];
1612: }
1614: ISRestoreIndices(isrow,&rp);
1615: VecRestoreArrayRead(bb,&b);
1616: VecRestoreArray(xx,&x);
1617: PetscLogFlops(4.0*a->nz - 3.0*mbs);
1618: return(0);
1619: }
1621: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1622: {
1623: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1624: IS isrow=a->row;
1625: PetscErrorCode ierr;
1626: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1627: const MatScalar *aa=a->a,*v;
1628: PetscScalar *x,xk,*t;
1629: const PetscScalar *b;
1630: PetscInt nz,k;
1633: VecGetArrayRead(bb,&b);
1634: VecGetArray(xx,&x);
1635: t = a->solve_work;
1636: ISGetIndices(isrow,&rp);
1638: /* solve U^T*D*y = perm(b) by forward substitution */
1639: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1640: for (k=0; k<mbs; k++) {
1641: v = aa + ai[k] + 1;
1642: vj = aj + ai[k] + 1;
1643: xk = t[k];
1644: nz = ai[k+1] - ai[k] - 1;
1645: while (nz--) t[*vj++] += (*v++) * xk;
1646: t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */
1647: }
1649: /* solve U*perm(x) = y by back substitution */
1650: for (k=mbs-1; k>=0; k--) {
1651: v = aa + ai[k] + 1;
1652: vj = aj + ai[k] + 1;
1653: nz = ai[k+1] - ai[k] - 1;
1654: while (nz--) t[k] += (*v++) * t[*vj++];
1655: x[rp[k]] = t[k];
1656: }
1658: ISRestoreIndices(isrow,&rp);
1659: VecRestoreArrayRead(bb,&b);
1660: VecRestoreArray(xx,&x);
1661: PetscLogFlops(4.0*a->nz - 3*mbs);
1662: return(0);
1663: }
1665: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1666: {
1667: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1668: IS isrow=a->row;
1669: PetscErrorCode ierr;
1670: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1671: const MatScalar *aa=a->a,*v;
1672: PetscReal diagk;
1673: PetscScalar *x,xk;
1674: const PetscScalar *b;
1675: PetscInt nz,k;
1678: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1679: VecGetArrayRead(bb,&b);
1680: VecGetArray(xx,&x);
1681: ISGetIndices(isrow,&rp);
1683: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1684: for (k=0; k<mbs; k++) {
1685: v = aa + ai[k];
1686: vj = aj + ai[k];
1687: xk = x[k];
1688: nz = ai[k+1] - ai[k] - 1;
1689: while (nz--) x[*vj++] += (*v++) * xk;
1691: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1692: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1693: x[k] = xk*PetscSqrtReal(diagk);
1694: }
1695: ISRestoreIndices(isrow,&rp);
1696: VecRestoreArrayRead(bb,&b);
1697: VecRestoreArray(xx,&x);
1698: PetscLogFlops(2.0*a->nz - mbs);
1699: return(0);
1700: }
1702: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1703: {
1704: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1705: IS isrow=a->row;
1706: PetscErrorCode ierr;
1707: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1708: const MatScalar *aa=a->a,*v;
1709: PetscReal diagk;
1710: PetscScalar *x,xk;
1711: const PetscScalar *b;
1712: PetscInt nz,k;
1715: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1716: VecGetArrayRead(bb,&b);
1717: VecGetArray(xx,&x);
1718: ISGetIndices(isrow,&rp);
1720: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1721: for (k=0; k<mbs; k++) {
1722: v = aa + ai[k] + 1;
1723: vj = aj + ai[k] + 1;
1724: xk = x[k];
1725: nz = ai[k+1] - ai[k] - 1;
1726: while (nz--) x[*vj++] += (*v++) * xk;
1728: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1729: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1730: x[k] = xk*PetscSqrtReal(diagk);
1731: }
1732: ISRestoreIndices(isrow,&rp);
1733: VecRestoreArrayRead(bb,&b);
1734: VecRestoreArray(xx,&x);
1735: PetscLogFlops(2.0*a->nz - mbs);
1736: return(0);
1737: }
1739: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1740: {
1741: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1742: IS isrow=a->row;
1743: PetscErrorCode ierr;
1744: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1745: const MatScalar *aa=a->a,*v;
1746: PetscReal diagk;
1747: PetscScalar *x,*t;
1748: const PetscScalar *b;
1749: PetscInt nz,k;
1752: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1753: VecGetArrayRead(bb,&b);
1754: VecGetArray(xx,&x);
1755: t = a->solve_work;
1756: ISGetIndices(isrow,&rp);
1758: for (k=mbs-1; k>=0; k--) {
1759: v = aa + ai[k];
1760: vj = aj + ai[k];
1761: diagk = PetscRealPart(aa[adiag[k]]);
1762: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1763: t[k] = b[k] * PetscSqrtReal(diagk);
1764: nz = ai[k+1] - ai[k] - 1;
1765: while (nz--) t[k] += (*v++) * t[*vj++];
1766: x[rp[k]] = t[k];
1767: }
1768: ISRestoreIndices(isrow,&rp);
1769: VecRestoreArrayRead(bb,&b);
1770: VecRestoreArray(xx,&x);
1771: PetscLogFlops(2.0*a->nz - mbs);
1772: return(0);
1773: }
1775: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1776: {
1777: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1778: IS isrow=a->row;
1779: PetscErrorCode ierr;
1780: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1781: const MatScalar *aa=a->a,*v;
1782: PetscReal diagk;
1783: PetscScalar *x,*t;
1784: const PetscScalar *b;
1785: PetscInt nz,k;
1788: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1789: VecGetArrayRead(bb,&b);
1790: VecGetArray(xx,&x);
1791: t = a->solve_work;
1792: ISGetIndices(isrow,&rp);
1794: for (k=mbs-1; k>=0; k--) {
1795: v = aa + ai[k] + 1;
1796: vj = aj + ai[k] + 1;
1797: diagk = PetscRealPart(aa[ai[k]]);
1798: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1799: t[k] = b[k] * PetscSqrtReal(diagk);
1800: nz = ai[k+1] - ai[k] - 1;
1801: while (nz--) t[k] += (*v++) * t[*vj++];
1802: x[rp[k]] = t[k];
1803: }
1804: ISRestoreIndices(isrow,&rp);
1805: VecRestoreArrayRead(bb,&b);
1806: VecRestoreArray(xx,&x);
1807: PetscLogFlops(2.0*a->nz - mbs);
1808: return(0);
1809: }
1811: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1812: {
1813: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1817: if (A->rmap->bs == 1) {
1818: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1819: } else {
1820: IS isrow=a->row;
1821: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1822: const MatScalar *aa=a->a,*v;
1823: PetscScalar *x,*t;
1824: const PetscScalar *b;
1825: PetscInt nz,k,n,i,j;
1827: if (bb->n > a->solves_work_n) {
1828: PetscFree(a->solves_work);
1829: PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1830: a->solves_work_n = bb->n;
1831: }
1832: n = bb->n;
1833: VecGetArrayRead(bb->v,&b);
1834: VecGetArray(xx->v,&x);
1835: t = a->solves_work;
1837: ISGetIndices(isrow,&rp);
1839: /* solve U^T*D*y = perm(b) by forward substitution */
1840: for (k=0; k<mbs; k++) {
1841: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1842: }
1843: for (k=0; k<mbs; k++) {
1844: v = aa + ai[k];
1845: vj = aj + ai[k];
1846: nz = ai[k+1] - ai[k] - 1;
1847: for (j=0; j<nz; j++) {
1848: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1849: v++;vj++;
1850: }
1851: for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1852: }
1854: /* solve U*perm(x) = y by back substitution */
1855: for (k=mbs-1; k>=0; k--) {
1856: v = aa + ai[k] - 1;
1857: vj = aj + ai[k] - 1;
1858: nz = ai[k+1] - ai[k] - 1;
1859: for (j=0; j<nz; j++) {
1860: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1861: v++;vj++;
1862: }
1863: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1864: }
1866: ISRestoreIndices(isrow,&rp);
1867: VecRestoreArrayRead(bb->v,&b);
1868: VecRestoreArray(xx->v,&x);
1869: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1870: }
1871: return(0);
1872: }
1874: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1875: {
1876: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1880: if (A->rmap->bs == 1) {
1881: MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1882: } else {
1883: IS isrow=a->row;
1884: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1885: const MatScalar *aa=a->a,*v;
1886: PetscScalar *x,*t;
1887: const PetscScalar *b;
1888: PetscInt nz,k,n,i;
1890: if (bb->n > a->solves_work_n) {
1891: PetscFree(a->solves_work);
1892: PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1893: a->solves_work_n = bb->n;
1894: }
1895: n = bb->n;
1896: VecGetArrayRead(bb->v,&b);
1897: VecGetArray(xx->v,&x);
1898: t = a->solves_work;
1900: ISGetIndices(isrow,&rp);
1902: /* solve U^T*D*y = perm(b) by forward substitution */
1903: for (k=0; k<mbs; k++) {
1904: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1905: }
1906: for (k=0; k<mbs; k++) {
1907: v = aa + ai[k];
1908: vj = aj + ai[k];
1909: nz = ai[k+1] - ai[k];
1910: while (nz--) {
1911: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1912: v++;vj++;
1913: }
1914: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1915: }
1917: /* solve U*perm(x) = y by back substitution */
1918: for (k=mbs-1; k>=0; k--) {
1919: v = aa + ai[k];
1920: vj = aj + ai[k];
1921: nz = ai[k+1] - ai[k];
1922: while (nz--) {
1923: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1924: v++;vj++;
1925: }
1926: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1927: }
1929: ISRestoreIndices(isrow,&rp);
1930: VecRestoreArrayRead(bb->v,&b);
1931: VecRestoreArray(xx->v,&x);
1932: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1933: }
1934: return(0);
1935: }
1937: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1938: {
1939: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1940: PetscErrorCode ierr;
1941: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1942: const MatScalar *aa=a->a,*v;
1943: const PetscScalar *b;
1944: PetscScalar *x,xi;
1945: PetscInt nz,i,j;
1948: VecGetArrayRead(bb,&b);
1949: VecGetArray(xx,&x);
1950: /* solve U^T*D*y = b by forward substitution */
1951: PetscArraycpy(x,b,mbs);
1952: for (i=0; i<mbs; i++) {
1953: v = aa + ai[i];
1954: vj = aj + ai[i];
1955: xi = x[i];
1956: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1957: for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1958: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
1959: }
1960: /* solve U*x = y by backward substitution */
1961: for (i=mbs-2; i>=0; i--) {
1962: xi = x[i];
1963: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
1964: vj = aj + adiag[i] - 1;
1965: nz = ai[i+1] - ai[i] - 1;
1966: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1967: x[i] = xi;
1968: }
1969: VecRestoreArrayRead(bb,&b);
1970: VecRestoreArray(xx,&x);
1971: PetscLogFlops(4.0*a->nz - 3*mbs);
1972: return(0);
1973: }
1975: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)
1976: {
1977: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1978: PetscErrorCode ierr;
1979: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1980: const MatScalar *aa=a->a,*v;
1981: const PetscScalar *b;
1982: PetscScalar *x,xi;
1983: PetscInt nz,i,j,neq,ldb,ldx;
1984: PetscBool isdense;
1987: if (!mbs) return(0);
1988: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);
1989: if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix");
1990: if (X != B) {
1991: PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);
1992: if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix");
1993: }
1994: MatDenseGetArrayRead(B,&b);
1995: MatDenseGetLDA(B,&ldb);
1996: MatDenseGetArray(X,&x);
1997: MatDenseGetLDA(X,&ldx);
1998: for (neq=0; neq<B->cmap->n; neq++) {
1999: /* solve U^T*D*y = b by forward substitution */
2000: PetscArraycpy(x,b,mbs);
2001: for (i=0; i<mbs; i++) {
2002: v = aa + ai[i];
2003: vj = aj + ai[i];
2004: xi = x[i];
2005: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2006: for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
2007: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2008: }
2009: /* solve U*x = y by backward substitution */
2010: for (i=mbs-2; i>=0; i--) {
2011: xi = x[i];
2012: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2013: vj = aj + adiag[i] - 1;
2014: nz = ai[i+1] - ai[i] - 1;
2015: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2016: x[i] = xi;
2017: }
2018: b += ldb;
2019: x += ldx;
2020: }
2021: MatDenseRestoreArrayRead(B,&b);
2022: MatDenseRestoreArray(X,&x);
2023: PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));
2024: return(0);
2025: }
2027: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2028: {
2029: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2030: PetscErrorCode ierr;
2031: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2032: const MatScalar *aa=a->a,*v;
2033: PetscScalar *x,xk;
2034: const PetscScalar *b;
2035: PetscInt nz,k;
2038: VecGetArrayRead(bb,&b);
2039: VecGetArray(xx,&x);
2041: /* solve U^T*D*y = b by forward substitution */
2042: PetscArraycpy(x,b,mbs);
2043: for (k=0; k<mbs; k++) {
2044: v = aa + ai[k] + 1;
2045: vj = aj + ai[k] + 1;
2046: xk = x[k];
2047: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2048: while (nz--) x[*vj++] += (*v++) * xk;
2049: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2050: }
2052: /* solve U*x = y by back substitution */
2053: for (k=mbs-2; k>=0; k--) {
2054: v = aa + ai[k] + 1;
2055: vj = aj + ai[k] + 1;
2056: xk = x[k];
2057: nz = ai[k+1] - ai[k] - 1;
2058: while (nz--) {
2059: xk += (*v++) * x[*vj++];
2060: }
2061: x[k] = xk;
2062: }
2064: VecRestoreArrayRead(bb,&b);
2065: VecRestoreArray(xx,&x);
2066: PetscLogFlops(4.0*a->nz - 3*mbs);
2067: return(0);
2068: }
2070: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2071: {
2072: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2073: PetscErrorCode ierr;
2074: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2075: const MatScalar *aa=a->a,*v;
2076: PetscReal diagk;
2077: PetscScalar *x;
2078: const PetscScalar *b;
2079: PetscInt nz,k;
2082: /* solve U^T*D^(1/2)*x = b by forward substitution */
2083: VecGetArrayRead(bb,&b);
2084: VecGetArray(xx,&x);
2085: PetscArraycpy(x,b,mbs);
2086: for (k=0; k<mbs; k++) {
2087: v = aa + ai[k];
2088: vj = aj + ai[k];
2089: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2090: while (nz--) x[*vj++] += (*v++) * x[k];
2091: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2092: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2093: x[k] *= PetscSqrtReal(diagk);
2094: }
2095: VecRestoreArrayRead(bb,&b);
2096: VecRestoreArray(xx,&x);
2097: PetscLogFlops(2.0*a->nz - mbs);
2098: return(0);
2099: }
2101: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2102: {
2103: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2104: PetscErrorCode ierr;
2105: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2106: const MatScalar *aa=a->a,*v;
2107: PetscReal diagk;
2108: PetscScalar *x;
2109: const PetscScalar *b;
2110: PetscInt nz,k;
2113: /* solve U^T*D^(1/2)*x = b by forward substitution */
2114: VecGetArrayRead(bb,&b);
2115: VecGetArray(xx,&x);
2116: PetscArraycpy(x,b,mbs);
2117: for (k=0; k<mbs; k++) {
2118: v = aa + ai[k] + 1;
2119: vj = aj + ai[k] + 1;
2120: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2121: while (nz--) x[*vj++] += (*v++) * x[k];
2122: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2123: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2124: x[k] *= PetscSqrtReal(diagk);
2125: }
2126: VecRestoreArrayRead(bb,&b);
2127: VecRestoreArray(xx,&x);
2128: PetscLogFlops(2.0*a->nz - mbs);
2129: return(0);
2130: }
2132: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2133: {
2134: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2135: PetscErrorCode ierr;
2136: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2137: const MatScalar *aa=a->a,*v;
2138: PetscReal diagk;
2139: PetscScalar *x;
2140: const PetscScalar *b;
2141: PetscInt nz,k;
2144: /* solve D^(1/2)*U*x = b by back substitution */
2145: VecGetArrayRead(bb,&b);
2146: VecGetArray(xx,&x);
2148: for (k=mbs-1; k>=0; k--) {
2149: v = aa + ai[k];
2150: vj = aj + ai[k];
2151: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2152: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2153: x[k] = PetscSqrtReal(diagk)*b[k];
2154: nz = ai[k+1] - ai[k] - 1;
2155: while (nz--) x[k] += (*v++) * x[*vj++];
2156: }
2157: VecRestoreArrayRead(bb,&b);
2158: VecRestoreArray(xx,&x);
2159: PetscLogFlops(2.0*a->nz - mbs);
2160: return(0);
2161: }
2163: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2164: {
2165: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2166: PetscErrorCode ierr;
2167: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2168: const MatScalar *aa=a->a,*v;
2169: PetscReal diagk;
2170: PetscScalar *x;
2171: const PetscScalar *b;
2172: PetscInt nz,k;
2175: /* solve D^(1/2)*U*x = b by back substitution */
2176: VecGetArrayRead(bb,&b);
2177: VecGetArray(xx,&x);
2179: for (k=mbs-1; k>=0; k--) {
2180: v = aa + ai[k] + 1;
2181: vj = aj + ai[k] + 1;
2182: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2183: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2184: x[k] = PetscSqrtReal(diagk)*b[k];
2185: nz = ai[k+1] - ai[k] - 1;
2186: while (nz--) x[k] += (*v++) * x[*vj++];
2187: }
2188: VecRestoreArrayRead(bb,&b);
2189: VecRestoreArray(xx,&x);
2190: PetscLogFlops(2.0*a->nz - mbs);
2191: return(0);
2192: }
2194: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2195: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2196: {
2197: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2199: const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2200: PetscInt *jutmp,bs = A->rmap->bs,i;
2201: PetscInt m,reallocs = 0,*levtmp;
2202: PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2203: PetscInt incrlev,*lev,shift,prow,nz;
2204: PetscReal f = info->fill,levels = info->levels;
2205: PetscBool perm_identity;
2208: /* check whether perm is the identity mapping */
2209: ISIdentity(perm,&perm_identity);
2211: if (perm_identity) {
2212: a->permute = PETSC_FALSE;
2213: ai = a->i; aj = a->j;
2214: } else { /* non-trivial permutation */
2215: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2216: }
2218: /* initialization */
2219: ISGetIndices(perm,&rip);
2220: umax = (PetscInt)(f*ai[mbs] + 1);
2221: PetscMalloc1(umax,&lev);
2222: umax += mbs + 1;
2223: shift = mbs + 1;
2224: PetscMalloc1(mbs+1,&iu);
2225: PetscMalloc1(umax,&ju);
2226: iu[0] = mbs + 1;
2227: juidx = mbs + 1;
2228: /* prowl: linked list for pivot row */
2229: PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);
2230: /* q: linked list for col index */
2232: for (i=0; i<mbs; i++) {
2233: prowl[i] = mbs;
2234: q[i] = 0;
2235: }
2237: /* for each row k */
2238: for (k=0; k<mbs; k++) {
2239: nzk = 0;
2240: q[k] = mbs;
2241: /* copy current row into linked list */
2242: nz = ai[rip[k]+1] - ai[rip[k]];
2243: j = ai[rip[k]];
2244: while (nz--) {
2245: vj = rip[aj[j++]];
2246: if (vj > k) {
2247: qm = k;
2248: do {
2249: m = qm; qm = q[m];
2250: } while (qm < vj);
2251: if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2252: nzk++;
2253: q[m] = vj;
2254: q[vj] = qm;
2255: levtmp[vj] = 0; /* initialize lev for nonzero element */
2256: }
2257: }
2259: /* modify nonzero structure of k-th row by computing fill-in
2260: for each row prow to be merged in */
2261: prow = k;
2262: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2264: while (prow < k) {
2265: /* merge row prow into k-th row */
2266: jmin = iu[prow] + 1;
2267: jmax = iu[prow+1];
2268: qm = k;
2269: for (j=jmin; j<jmax; j++) {
2270: incrlev = lev[j-shift] + 1;
2271: if (incrlev > levels) continue;
2272: vj = ju[j];
2273: do {
2274: m = qm; qm = q[m];
2275: } while (qm < vj);
2276: if (qm != vj) { /* a fill */
2277: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2278: levtmp[vj] = incrlev;
2279: } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2280: }
2281: prow = prowl[prow]; /* next pivot row */
2282: }
2284: /* add k to row list for first nonzero element in k-th row */
2285: if (nzk > 1) {
2286: i = q[k]; /* col value of first nonzero element in k_th row of U */
2287: prowl[k] = prowl[i]; prowl[i] = k;
2288: }
2289: iu[k+1] = iu[k] + nzk;
2291: /* allocate more space to ju and lev if needed */
2292: if (iu[k+1] > umax) {
2293: /* estimate how much additional space we will need */
2294: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2295: /* just double the memory each time */
2296: maxadd = umax;
2297: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2298: umax += maxadd;
2300: /* allocate a longer ju */
2301: PetscMalloc1(umax,&jutmp);
2302: PetscArraycpy(jutmp,ju,iu[k]);
2303: PetscFree(ju);
2304: ju = jutmp;
2306: PetscMalloc1(umax,&jutmp);
2307: PetscArraycpy(jutmp,lev,iu[k]-shift);
2308: PetscFree(lev);
2309: lev = jutmp;
2310: reallocs += 2; /* count how many times we realloc */
2311: }
2313: /* save nonzero structure of k-th row in ju */
2314: i=k;
2315: while (nzk--) {
2316: i = q[i];
2317: ju[juidx] = i;
2318: lev[juidx-shift] = levtmp[i];
2319: juidx++;
2320: }
2321: }
2323: #if defined(PETSC_USE_INFO)
2324: if (ai[mbs] != 0) {
2325: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2326: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
2327: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2328: PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);
2329: PetscInfo(A,"for best performance.\n");
2330: } else {
2331: PetscInfo(A,"Empty matrix\n");
2332: }
2333: #endif
2335: ISRestoreIndices(perm,&rip);
2336: PetscFree3(prowl,q,levtmp);
2337: PetscFree(lev);
2339: /* put together the new matrix */
2340: MatSeqSBAIJSetPreallocation(B,bs,0,NULL);
2342: /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
2343: b = (Mat_SeqSBAIJ*)(B)->data;
2344: PetscFree2(b->imax,b->ilen);
2346: b->singlemalloc = PETSC_FALSE;
2347: b->free_a = PETSC_TRUE;
2348: b->free_ij = PETSC_TRUE;
2349: /* the next line frees the default space generated by the Create() */
2350: PetscFree3(b->a,b->j,b->i);
2351: PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);
2352: b->j = ju;
2353: b->i = iu;
2354: b->diag = NULL;
2355: b->ilen = NULL;
2356: b->imax = NULL;
2358: ISDestroy(&b->row);
2359: ISDestroy(&b->icol);
2360: b->row = perm;
2361: b->icol = perm;
2362: PetscObjectReference((PetscObject)perm);
2363: PetscObjectReference((PetscObject)perm);
2364: PetscMalloc1(bs*mbs+bs,&b->solve_work);
2365: /* In b structure: Free imax, ilen, old a, old j.
2366: Allocate idnew, solve_work, new a, new j */
2367: PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2368: b->maxnz = b->nz = iu[mbs];
2370: (B)->info.factor_mallocs = reallocs;
2371: (B)->info.fill_ratio_given = f;
2372: if (ai[mbs] != 0) {
2373: (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2374: } else {
2375: (B)->info.fill_ratio_needed = 0.0;
2376: }
2377: MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2378: return(0);
2379: }
2381: /*
2382: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2383: */
2384: #include <petscbt.h>
2385: #include <../src/mat/utils/freespace.h>
2386: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2387: {
2388: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2389: PetscErrorCode ierr;
2390: PetscBool perm_identity,free_ij = PETSC_TRUE,missing;
2391: PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2392: const PetscInt *rip;
2393: PetscInt reallocs=0,i,*ui,*udiag,*cols;
2394: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2395: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2396: PetscReal fill =info->fill,levels=info->levels;
2397: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2398: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2399: PetscBT lnkbt;
2402: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2403: MatMissingDiagonal(A,&missing,&d);
2404: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2405: if (bs > 1) {
2406: MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2407: return(0);
2408: }
2410: /* check whether perm is the identity mapping */
2411: ISIdentity(perm,&perm_identity);
2412: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2413: a->permute = PETSC_FALSE;
2415: PetscMalloc1(am+1,&ui);
2416: PetscMalloc1(am+1,&udiag);
2417: ui[0] = 0;
2419: /* ICC(0) without matrix ordering: simply rearrange column indices */
2420: if (!levels) {
2421: /* reuse the column pointers and row offsets for memory savings */
2422: for (i=0; i<am; i++) {
2423: ncols = ai[i+1] - ai[i];
2424: ui[i+1] = ui[i] + ncols;
2425: udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2426: }
2427: PetscMalloc1(ui[am]+1,&uj);
2428: cols = uj;
2429: for (i=0; i<am; i++) {
2430: aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2431: ncols = ai[i+1] - ai[i] -1;
2432: for (j=0; j<ncols; j++) *cols++ = aj[j];
2433: *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2434: }
2435: } else { /* case: levels>0 */
2436: ISGetIndices(perm,&rip);
2438: /* initialization */
2439: /* jl: linked list for storing indices of the pivot rows
2440: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2441: PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2442: for (i=0; i<am; i++) {
2443: jl[i] = am; il[i] = 0;
2444: }
2446: /* create and initialize a linked list for storing column indices of the active row k */
2447: nlnk = am + 1;
2448: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2450: /* initial FreeSpace size is fill*(ai[am]+1) */
2451: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);
2453: current_space = free_space;
2455: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);
2457: current_space_lvl = free_space_lvl;
2459: for (k=0; k<am; k++) { /* for each active row k */
2460: /* initialize lnk by the column indices of row k */
2461: nzk = 0;
2462: ncols = ai[k+1] - ai[k];
2463: if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2464: cols = aj+ai[k];
2465: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2466: nzk += nlnk;
2468: /* update lnk by computing fill-in for each pivot row to be merged in */
2469: prow = jl[k]; /* 1st pivot row */
2471: while (prow < k) {
2472: nextprow = jl[prow];
2474: /* merge prow into k-th row */
2475: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2476: jmax = ui[prow+1];
2477: ncols = jmax-jmin;
2478: i = jmin - ui[prow];
2480: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2481: uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2482: j = *(uj - 1);
2483: PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2484: nzk += nlnk;
2486: /* update il and jl for prow */
2487: if (jmin < jmax) {
2488: il[prow] = jmin;
2490: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2491: }
2492: prow = nextprow;
2493: }
2495: /* if free space is not available, make more free space */
2496: if (current_space->local_remaining<nzk) {
2497: i = am - k + 1; /* num of unfactored rows */
2498: i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2499: PetscFreeSpaceGet(i,¤t_space);
2500: PetscFreeSpaceGet(i,¤t_space_lvl);
2501: reallocs++;
2502: }
2504: /* copy data into free_space and free_space_lvl, then initialize lnk */
2505: if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2506: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2508: /* add the k-th row into il and jl */
2509: if (nzk > 1) {
2510: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2511: jl[k] = jl[i]; jl[i] = k;
2512: il[k] = ui[k] + 1;
2513: }
2514: uj_ptr[k] = current_space->array;
2515: uj_lvl_ptr[k] = current_space_lvl->array;
2517: current_space->array += nzk;
2518: current_space->local_used += nzk;
2519: current_space->local_remaining -= nzk;
2520: current_space_lvl->array += nzk;
2521: current_space_lvl->local_used += nzk;
2522: current_space_lvl->local_remaining -= nzk;
2524: ui[k+1] = ui[k] + nzk;
2525: }
2527: ISRestoreIndices(perm,&rip);
2528: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2530: /* destroy list of free space and other temporary array(s) */
2531: PetscMalloc1(ui[am]+1,&uj);
2532: PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor */
2533: PetscIncompleteLLDestroy(lnk,lnkbt);
2534: PetscFreeSpaceDestroy(free_space_lvl);
2536: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2538: /* put together the new matrix in MATSEQSBAIJ format */
2539: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2541: b = (Mat_SeqSBAIJ*)(fact)->data;
2542: PetscFree2(b->imax,b->ilen);
2544: b->singlemalloc = PETSC_FALSE;
2545: b->free_a = PETSC_TRUE;
2546: b->free_ij = free_ij;
2548: PetscMalloc1(ui[am]+1,&b->a);
2550: b->j = uj;
2551: b->i = ui;
2552: b->diag = udiag;
2553: b->free_diag = PETSC_TRUE;
2554: b->ilen = NULL;
2555: b->imax = NULL;
2556: b->row = perm;
2557: b->col = perm;
2559: PetscObjectReference((PetscObject)perm);
2560: PetscObjectReference((PetscObject)perm);
2562: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2564: PetscMalloc1(am+1,&b->solve_work);
2565: PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));
2567: b->maxnz = b->nz = ui[am];
2569: fact->info.factor_mallocs = reallocs;
2570: fact->info.fill_ratio_given = fill;
2571: if (ai[am] != 0) {
2572: fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2573: } else {
2574: fact->info.fill_ratio_needed = 0.0;
2575: }
2576: #if defined(PETSC_USE_INFO)
2577: if (ai[am] != 0) {
2578: PetscReal af = fact->info.fill_ratio_needed;
2579: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2580: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2581: PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2582: } else {
2583: PetscInfo(A,"Empty matrix\n");
2584: }
2585: #endif
2586: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2587: return(0);
2588: }
2590: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2591: {
2592: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2593: Mat_SeqSBAIJ *b;
2594: PetscErrorCode ierr;
2595: PetscBool perm_identity,free_ij = PETSC_TRUE;
2596: PetscInt bs=A->rmap->bs,am=a->mbs;
2597: const PetscInt *cols,*rip,*ai=a->i,*aj=a->j;
2598: PetscInt reallocs=0,i,*ui;
2599: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2600: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2601: PetscReal fill =info->fill,levels=info->levels,ratio_needed;
2602: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2603: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2604: PetscBT lnkbt;
2607: /*
2608: This code originally uses Modified Sparse Row (MSR) storage
2609: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2610: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2611: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2612: thus the original code in MSR format is still used for these cases.
2613: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2614: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2615: */
2616: if (bs > 1) {
2617: MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2618: return(0);
2619: }
2621: /* check whether perm is the identity mapping */
2622: ISIdentity(perm,&perm_identity);
2623: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2624: a->permute = PETSC_FALSE;
2626: /* special case that simply copies fill pattern */
2627: if (!levels) {
2628: /* reuse the column pointers and row offsets for memory savings */
2629: ui = a->i;
2630: uj = a->j;
2631: free_ij = PETSC_FALSE;
2632: ratio_needed = 1.0;
2633: } else { /* case: levels>0 */
2634: ISGetIndices(perm,&rip);
2636: /* initialization */
2637: PetscMalloc1(am+1,&ui);
2638: ui[0] = 0;
2640: /* jl: linked list for storing indices of the pivot rows
2641: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2642: PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2643: for (i=0; i<am; i++) {
2644: jl[i] = am; il[i] = 0;
2645: }
2647: /* create and initialize a linked list for storing column indices of the active row k */
2648: nlnk = am + 1;
2649: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2651: /* initial FreeSpace size is fill*(ai[am]+1) */
2652: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);
2654: current_space = free_space;
2656: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);
2658: current_space_lvl = free_space_lvl;
2660: for (k=0; k<am; k++) { /* for each active row k */
2661: /* initialize lnk by the column indices of row rip[k] */
2662: nzk = 0;
2663: ncols = ai[rip[k]+1] - ai[rip[k]];
2664: cols = aj+ai[rip[k]];
2665: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2666: nzk += nlnk;
2668: /* update lnk by computing fill-in for each pivot row to be merged in */
2669: prow = jl[k]; /* 1st pivot row */
2671: while (prow < k) {
2672: nextprow = jl[prow];
2674: /* merge prow into k-th row */
2675: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2676: jmax = ui[prow+1];
2677: ncols = jmax-jmin;
2678: i = jmin - ui[prow];
2679: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2680: j = *(uj_lvl_ptr[prow] + i - 1);
2681: cols_lvl = uj_lvl_ptr[prow]+i;
2682: PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2683: nzk += nlnk;
2685: /* update il and jl for prow */
2686: if (jmin < jmax) {
2687: il[prow] = jmin;
2688: j = *cols;
2689: jl[prow] = jl[j];
2690: jl[j] = prow;
2691: }
2692: prow = nextprow;
2693: }
2695: /* if free space is not available, make more free space */
2696: if (current_space->local_remaining<nzk) {
2697: i = am - k + 1; /* num of unfactored rows */
2698: i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2699: PetscFreeSpaceGet(i,¤t_space);
2700: PetscFreeSpaceGet(i,¤t_space_lvl);
2701: reallocs++;
2702: }
2704: /* copy data into free_space and free_space_lvl, then initialize lnk */
2705: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2707: /* add the k-th row into il and jl */
2708: if (nzk-1 > 0) {
2709: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2710: jl[k] = jl[i]; jl[i] = k;
2711: il[k] = ui[k] + 1;
2712: }
2713: uj_ptr[k] = current_space->array;
2714: uj_lvl_ptr[k] = current_space_lvl->array;
2716: current_space->array += nzk;
2717: current_space->local_used += nzk;
2718: current_space->local_remaining -= nzk;
2719: current_space_lvl->array += nzk;
2720: current_space_lvl->local_used += nzk;
2721: current_space_lvl->local_remaining -= nzk;
2723: ui[k+1] = ui[k] + nzk;
2724: }
2726: ISRestoreIndices(perm,&rip);
2727: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2729: /* destroy list of free space and other temporary array(s) */
2730: PetscMalloc1(ui[am]+1,&uj);
2731: PetscFreeSpaceContiguous(&free_space,uj);
2732: PetscIncompleteLLDestroy(lnk,lnkbt);
2733: PetscFreeSpaceDestroy(free_space_lvl);
2734: if (ai[am] != 0) {
2735: ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2736: } else {
2737: ratio_needed = 0.0;
2738: }
2739: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2741: /* put together the new matrix in MATSEQSBAIJ format */
2742: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2744: b = (Mat_SeqSBAIJ*)(fact)->data;
2746: PetscFree2(b->imax,b->ilen);
2748: b->singlemalloc = PETSC_FALSE;
2749: b->free_a = PETSC_TRUE;
2750: b->free_ij = free_ij;
2752: PetscMalloc1(ui[am]+1,&b->a);
2754: b->j = uj;
2755: b->i = ui;
2756: b->diag = NULL;
2757: b->ilen = NULL;
2758: b->imax = NULL;
2759: b->row = perm;
2760: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2762: PetscObjectReference((PetscObject)perm);
2763: b->icol = perm;
2764: PetscObjectReference((PetscObject)perm);
2765: PetscMalloc1(am+1,&b->solve_work);
2767: b->maxnz = b->nz = ui[am];
2769: fact->info.factor_mallocs = reallocs;
2770: fact->info.fill_ratio_given = fill;
2771: fact->info.fill_ratio_needed = ratio_needed;
2772: #if defined(PETSC_USE_INFO)
2773: if (ai[am] != 0) {
2774: PetscReal af = fact->info.fill_ratio_needed;
2775: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2776: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2777: PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2778: } else {
2779: PetscInfo(A,"Empty matrix\n");
2780: }
2781: #endif
2782: if (perm_identity) {
2783: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2784: } else {
2785: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2786: }
2787: return(0);
2788: }