Actual source code: aijcusparseband.cu

/*
  AIJCUSPARSE methods implemented with CUDA kernels. Uses cuSPARSE/Thrust maps from AIJCUSPARSE
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#include <cooperative_groups.h>
#endif

#define CHECK_LAUNCH_ERROR()                                                             \
do {                                                                                     \
  /* Check synchronous errors, i.e. pre-launch */                                        \
  cudaError_t err = cudaGetLastError();                                                  \
  if (cudaSuccess != err) {                                                              \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "CUDA error: %s",cudaGetErrorString(err)); \
  }                                                                                      \
  /* Check asynchronous errors, i.e. kernel failed (ULF) */                              \
  err = cudaDeviceSynchronize();                                                         \
  if (cudaSuccess != err) {                                                              \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "CUDA error: %s",cudaGetErrorString(err)); \
  }                                                                                      \
} while (0)
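/* Note: CHECK_LAUNCH_ERROR() calls cudaDeviceSynchronize(), so it both catches
   asynchronous kernel failures and acts as a full device synchronization point;
   the call sites below rely on that ("does a sync"). */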

/*
  LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)

  requires:
     structurally symmetric: fix with transpose/column metadata
*/

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);

/*
  The GPU LU factor kernel
*/
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
{
  const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt  nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // set i (row+1)
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i && threadIdx.x==0) {
      const PetscInt i    = rowb + 1;                        // number of rows summed so far (and the diagonal count)
      const PetscInt ni   = (rowb>bw) ? bw+1 : i;            // lower-band width of this row, diagonal included
      const PetscInt n1L  = ni*(ni-1)/2;                     // strictly-lower entries accumulated over the leading triangular "ear"
      const PetscInt nug  = i*bw;                            // upper entries assuming a full band of width bw in every row
      const PetscInt n2L  = bw*((rowb>bw) ? (rowb-bw) : 0);  // strictly-lower entries in full-band rows past the ear
      const PetscInt mi   = bw + rowb + 1 - n;               // how far this row's band hangs past column n-1
      const PetscInt clip = (mi>0) ? mi*(mi-1)/2 + mi : 0;   // upper entries clipped off in the trailing "ear"
      bi_csr[rowb+1] = n1L + nug - clip + n2L + i;           // inclusive prefix sum: CSR row pointer for row rowb+1
    }
  }
}
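/* Illustration (not part of the build): for n=5, bw=1 the band CSR rows hold
     row 0: d u        -> 2 entries, bi_csr[1] = 2
     row 1: l d u      -> 3 entries, bi_csr[2] = 5
     row 2: l d u      -> 3 entries, bi_csr[3] = 8
     row 3: l d u      -> 3 entries, bi_csr[4] = 11
     row 4: l d        -> 2 entries, bi_csr[5] = 13
   i.e. a dense band of width 2*bw+1 with the triangular "ears" clipped at both
   ends, matching nzBcsr = n + (2*n-1)*bw - bw*bw = 13 computed in the symbolic phase. */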
// copy AIJ to AIJ_BAND
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
                                const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
                                const int bi_csr[], PetscScalar ba_csr[])
{
  const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt  nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // zero B
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      PetscScalar    *batmp = ba_csr + bi_csr[rowb];
      const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
      for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
        if (j<nzb) {
          batmp[j] = 0;
        }
      }
    }
  }

  // copy A into B with CSR format -- these two loops can be fused
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      const PetscInt    rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
      const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
      const PetscScalar *av    = aa_d + ai_d[rowa];
      PetscScalar       *batmp = ba_csr + bi_csr[rowb];
      /* load in initial (unfactored row) */
      for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
        if (j<nza) {
          PetscInt    colb = ic[ajtmp[j]], idx = colb - bjStart;
          PetscScalar vala = av[j];
          batmp[idx] = vala;
        }
      }
    }
  }
}
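/* Index maps used above (as read from the code): r[] takes a factored (permuted)
   row to the corresponding row of A, while ic[] takes an original column index of
   A to its permuted position, so each entry of A lands in the band of its permuted row. */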
// print AIJ_BAND
__global__
void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
{
  // debug
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) {
    printf("B (AIJ) n=%d:\n",(int)n);
    for (int rowb=0;rowb<n;rowb++) {
      const PetscInt    nz = bi_csr[rowb+1] - bi_csr[rowb];
      const PetscScalar *batmp = ba_csr + bi_csr[rowb];
      for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
      printf(" bi=%d\n",bi_csr[rowb+1]);
    }
  }
}
// Band LU kernel ---  ba_csr bi_csr
__global__
void __launch_bounds__(1024,1)
  mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[], int *use_group_sync)
{
  const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt  start = field*nloc, end = start + nloc;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  auto g = cooperative_groups::this_grid();
#endif
  // A22 panel update for each row A(1,:) and col A(:,1)
  for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
    PetscInt          tnzUd = bw, maxU = end-1 - glbDD; // we are chopping off the interior ears (at the field boundaries)
    const PetscInt    nzUd  = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
    PetscScalar       *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
    const PetscScalar *baUd = pBdd + 1; // vector of data  U(i,i+1:end)
    const PetscScalar Bdd = *pBdd;
    const PetscInt    offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
    if (threadIdx.x==0) {
      for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) { /* assuming symmetric structure */
        const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
        PetscScalar    *Aid = ba_csr + bi_csr[myi] + kIdx;
        *Aid = *Aid/Bdd; // L(i,DD) = A(i,DD)/A(DD,DD)
      }
    }
    __syncthreads(); // sync on threadIdx.x only
    for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) {
      const PetscInt    bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
      PetscScalar       *Aid = ba_csr + bi_csr[myi] + kIdx;
      PetscScalar       *Aij =  Aid + 1;
      const PetscScalar Lid  = *Aid;
      for (int jIdx=threadIdx.x ; jIdx<nzUd; jIdx += blockDim.x) {
        Aij[jIdx] -= Lid*baUd[jIdx]; // rank-1 update of the trailing block
      }
    }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (use_group_sync) {
      g.sync();
    } else {
      __syncthreads();
    }
#else
    __syncthreads();
#endif
  } /* end of for (glbDD=start; glbDD<end; glbDD++) */
}
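/* Algorithm summary (a reading of the kernel above): this is right-looking LU
   without pivoting on each of the Nf diagonal blocks. For each pivot glbDD the
   column below the diagonal is scaled by 1/Bdd to form L, then each row i of the
   panel receives the rank-1 update A(i,j) -= L(i,DD)*U(DD,j). Rows are spread over
   (blockIdx.y, threadIdx.y) and row entries over threadIdx.x. When several thread
   blocks cooperate on one field (use_group_sync non-NULL), a cooperative-groups
   grid sync orders the pivot steps across blocks. */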

static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructA;
  CsrMatrix                    *matrixA;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  const PetscInt               n=A->rmap->n, *ic, *r;
  const int                    *ai_d, *aj_d;
  const PetscScalar            *aa_d;
  PetscScalar                  *ba_t = cusparseTriFactors->a_band_d;
  int                          *bi_t = cusparseTriFactors->i_band_d;
  PetscContainer               container;
  int                          Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1; // defaults are overwritten below

  if (A->rmap->n == 0) {
    return(0);
  }
  // cusparse setup
  if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
  matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; //  matstruct->cprowIndices
  if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matrixA = (CsrMatrix*)matstructA->mat;
  if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");

  // factor: get Nf if available
  PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
  if (container) {
    PetscInt *pNf=NULL;
    PetscContainerGetPointer(container, (void **) &pNf);
    Nf = (*pNf)%1000;
    if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  // get data
  ic      = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
  ai_d    = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  aj_d    = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa_d    = thrust::raw_pointer_cast(matrixA->values->data().get());
  r       = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  PetscLogGpuTimeBegin();
  {
    int bw = (int)(2.*(double)n-1. - (double)(PetscSqrtReal(1.+4.*((double)n*(double)n-(double)b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
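    /* The line above inverts nz = n + (2*n-1)*bw - bw*bw for bw: the quadratic
       bw^2 - (2*n-1)*bw + (nz-n) = 0 has the (smaller) root
       bw = ((2*n-1) - sqrt((2*n-1)^2 - 4*(nz-n)))/2, and the discriminant
       (2*n-1)^2 - 4*(nz-n) = 1 + 4*(n*n - nz) is exactly the one used. */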
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
    Ni = 1; // no cooperative launch before CUDA 11: one thread block per field
#else
    if (!cusparseTriFactors->init_dev_prop) {
      int gpuid;
      cusparseTriFactors->init_dev_prop = PETSC_TRUE;
      cudaGetDevice(&gpuid);
      cudaGetDeviceProperties(&cusparseTriFactors->dev_prop, gpuid);
    }
    nsm = cusparseTriFactors->dev_prop.multiProcessorCount;
    Ni = nsm/Nf/nconcurrent;
#endif
    team_size = bw/Ni + !!(bw%Ni);
    nVec = PetscMin(bw, 1024/team_size);
    PetscInfo7(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d, thread group size = %d,%d\n",bw,Ni,nconcurrent,Nf,nsm,team_size,nVec);
    {
      dim3 dimBlockTeam(nVec,team_size);
      dim3 dimBlockLeague(Nf,Ni);
      mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
      CHECK_LAUNCH_ERROR(); // does a sync
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (Ni > 1) {
        // the last argument is only tested for non-NULL in the kernel, never dereferenced
        void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t, (void*)&nsm };
        cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, 0, NULL);
      } else {
        mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam>>>(n, bw, bi_t, ba_t, NULL);
      }
#else
      mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam>>>(n, bw, bi_t, ba_t, NULL);
#endif
      CHECK_LAUNCH_ERROR(); // does a sync
#if defined(PETSC_USE_LOG)
      PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(PetscLogDouble)(2*bm1 + 1)/3 + (PetscLogDouble)2*(nl-bw)*bw*bw + (PetscLogDouble)nl*(nl+1)/2));
#endif
    }
  }
  PetscLogGpuTimeEnd();
  /* determine which version of MatSolve needs to be used. from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
  B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
  B->ops->solvetranspose = NULL; // need transpose
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;
  return(0);
}
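/* A minimal usage sketch (hypothetical KSP 'ksp'; assumes a MATSEQAIJCUSPARSE matrix
   and a structurally symmetric problem), equivalent to the command line
   -pc_factor_mat_solver_type cusparseband -mat_no_inode -pc_factor_mat_ordering_type rcm:

     PC pc;
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCLU);
     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSEBAND);
     PCFactorSetMatOrderingType(pc,MATORDERINGRCM);
*/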

static PetscErrorCode MatrixNfDestroy(void *ptr)
{
  PetscInt *nf = (PetscInt *)ptr;
  PetscErrorCode  ierr;
  PetscFree(nf);
  return(0);
}

PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b;
  IS                 isicol;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  const PetscInt     *ic,*ai=a->i,*aj=a->j;
  PetscScalar        *ba_t;
  int                *bi_t;
  PetscInt           i,n=A->rmap->n,Nf;
  PetscInt           nzBcsr,bwL,bwU;
  PetscBool          missing;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscContainer               container;

  if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
  MatMissingDiagonal(A,&missing,&i);
  if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors");
  MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);
  if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structurally symmetric matrices supported");

  // factor: get Nf if available
  PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
  if (container) {
    PetscInt *pNf=NULL;
    PetscContainerGetPointer(container, (void **) &pNf);
    Nf = (*pNf)%1000;
    PetscContainerCreate(PETSC_COMM_SELF, &container);
    PetscMalloc(sizeof(PetscInt), &pNf);
    *pNf = Nf;
    PetscContainerSetPointer(container, (void *)pNf);
    PetscContainerSetUserDestroy(container, MatrixNfDestroy);
    PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);
    PetscContainerDestroy(&container);
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
  ISGetIndices(isicol,&ic);

  MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);
  PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);
  b    = (Mat_SeqAIJ*)(B)->data;

  /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
  bwL = bwU = 0;
  for (int rwb=0; rwb<n; rwb++) {
    const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
    for (int j=0;j<anz;j++) {
      PetscInt colb = ic[ajtmp[j]];
      if (colb<rwa) { // L
        if (rwa-colb > bwL) bwL = rwa-colb;
      } else {
        if (colb-rwa > bwU) bwU = colb-rwa;
      }
    }
  }
  ISRestoreIndices(isicol,&ic);
  /* only support structurally symmetric, but it might work */
  if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
  MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
  nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
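  /* nzBcsr above: n rows of a full band of width 2*bwU+1 give (2*bwU+1)*n entries;
     subtracting the two clipped triangular ears, each bwU*(bwU+1)/2, leaves
     (2*bwU+1)*n - bwU*(bwU+1) = n + (2*n-1)*bwU - bwU*bwU. */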
  b->maxnz = b->nz = nzBcsr;
  cusparseTriFactors->nnz = b->nz; // only metadata needed: n & nz
  PetscInfo2(A,"Matrix Bandwidth = %D, nnz = %D\n",bwL,b->nz);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a place for flops
  cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
  cusparseTriFactors->a_band_d = ba_t;
  cusparseTriFactors->i_band_d = bi_t;
  /* In b structure:  Free imax, ilen, old a, old j.  Allocate solve_work, new a, new j */
  PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));
  {
    dim3 dimBlockTeam(1,128);
    dim3 dimBlockLeague(Nf,1);
    mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
  }
  CHECK_LAUNCH_ERROR(); // does a sync

  // setup data
  if (!cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ISGetIndices(isrow,&r);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ISRestoreIndices(isrow,&r);
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  }
  /* upper triangular indices */
  if (!cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ISGetIndices(isicol,&c);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ISRestoreIndices(isicol,&c);
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  }

  /* put together the new matrix */
  b->free_a       = PETSC_FALSE;
  b->free_ij      = PETSC_FALSE;
  b->singlemalloc = PETSC_FALSE;
  b->ilen = NULL;
  b->imax = NULL;
  b->row  = isrow;
  b->col  = iscol;
  PetscObjectReference((PetscObject)isrow);
  PetscObjectReference((PetscObject)iscol);
  b->icol = isicol;
  PetscMalloc1(n+1,&b->solve_work);

  B->factortype            = MAT_FACTOR_LU;
  B->info.factor_mallocs   = 0;
  B->info.fill_ratio_given = 0;

  if (ai[n]) {
    B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
  } else {
    B->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[n] != 0) {
    PetscReal af = B->info.fill_ratio_needed;
    PetscInfo1(A,"Band fill ratio %g\n",(double)af);
  } else {
    PetscInfo(A,"Empty matrix\n");
  }
#endif
  if (a->inode.size) {
    PetscInfo(A,"Warning: using inodes in band solver.\n");
  }
  MatSeqAIJCheckInode_FactorLU(B);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
  B->offloadmask = PETSC_OFFLOAD_GPU;

  return(0);
}

/* Use -pc_factor_mat_solver_type cusparseband */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
{
  *type = MATSOLVERCUSPARSEBAND;
  return(0);
}

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt       n = A->rmap->n;

  MatCreate(PetscObjectComm((PetscObject)A),B);
  MatSetSizes(*B,n,n,n,n);
  (*B)->factortype = ftype;
  (*B)->canuseordering = PETSC_TRUE;
  MatSetType(*B,MATSEQAIJCUSPARSE);

  if (ftype == MAT_FACTOR_LU) {
    MatSetBlockSizesFromMats(*B,A,A);
    (*B)->ops->ilufactorsymbolic = NULL; // MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
    PetscStrallocpy(MATORDERINGRCM,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND matrix types");

  MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
  PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);
  return(0);
}

#define WARP_SIZE 32
template <typename T>
__forceinline__ __device__
T wreduce(T a)
{
  T b;
  #pragma unroll
  for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
    b = __shfl_down_sync(0xffffffff, a, i);
    a += b;
  }
  return a;
}
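/* wreduce() is a standard warp-level tree reduction: after the shuffle offsets
   16, 8, 4, 2, 1, lane 0 holds the sum over all 32 lanes (other lanes hold
   partial sums only). */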
459: // reduce in a block, returns result in thread 0
460: template <typename T, int BLOCK_SIZE>
461: __device__
462: T breduce(T a)
463: {
464:   constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
465:   __shared__ double buf[NWARP];
466:   int wid = threadIdx.x / WARP_SIZE;
467:   int laneid = threadIdx.x % WARP_SIZE;
468:   T b = wreduce<T>(a);
469:   if (laneid == 0)
470:     buf[wid] = b;
471:   __syncthreads();
472:   if (wid == 0) {
473:     if (threadIdx.x < NWARP)
474:       a = buf[threadIdx.x];
475:     else
476:       a = 0;
477:     for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
478:       a += __shfl_down_sync(0xffffffff, a, i);
479:     }
480:   }
481:   return a;
482: }
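/* Notes (from inspection): the result is valid only in threadIdx.x == 0; the final
   shuffle loop assumes NWARP is a power of two (true for the BLOCK_SIZE = 128 used
   below, NWARP = 4); and callers must __syncthreads() before the next breduce()
   call so 'buf' is not overwritten early -- mat_solve_band() does. */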

// Band LU solve kernel ---  ba_csr
template <int BLOCK_SIZE>
__global__
void __launch_bounds__(256,1)
mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
{
  const PetscInt    Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc;
  const PetscInt    chopnz = bw*(bw+1)/2;          // entries clipped off one triangular ear
  const PetscInt    blocknz = (2*bw+1)*nloc;       // entries in a full-band field block
  const PetscInt    blocknz_0 = blocknz - chopnz;  // first block: leading ear is clipped
  const PetscScalar *pLi;
  const int         tid = threadIdx.x;

  /* Next, solve L */
  pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
  for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
    const PetscInt col = locDD<bw ? start : (glbDD-bw);
    PetscScalar t = 0;
    for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
      t += pLi[idx]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(t);
#endif
    if (threadIdx.x == 0)
      x[glbDD] -= t; // unit diagonal in L, no division
    __syncthreads();
    // inc
    pLi += glbDD-col; // get to diagonal
    if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
    else pLi += bw;
    pLi += 1; // skip to next row
    if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
  }
  /* Then, solve U */
  pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
  if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row

  for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
    const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
    PetscScalar t = 0;
    for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
      t += pLi[-idx]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
#endif
    pLi -= col-glbDD; // diagonal
    if (threadIdx.x == 0) {
      x[glbDD] -= t;
      x[glbDD] /= pLi[0];
    }
    __syncthreads();
    // inc past L to start of previous U
    pLi -= bw+1;
    if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
    if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
  }
}
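/* Solve structure (from inspection): one thread block per field does a forward
   substitution with L (unit diagonal, hence no division) followed by a back
   substitution with U, walking pLi through the band storage row by row; each
   row's dot product is spread over the block's threads and combined with breduce(). */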

static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscInt                              n=A->rmap->n, nz=cusparseTriFactors->nnz, Nf;
  PetscInt                              bw = (int)(2.*(double)n-1.-(double)(PetscSqrtReal(1.+4.*((double)n*(double)n-(double)nz))+PETSC_MACHINE_EPSILON))/2; // quadratic formula for bandwidth (see MatLUFactorNumeric above)
  PetscErrorCode                        ierr;
  PetscContainer                        container;

  if (A->rmap->n == 0) {
    return(0);
  }
  // factor: get Nf if available
  PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
  if (container) {
    PetscInt *pNf=NULL;
    PetscContainerGetPointer(container, (void **) &pNf);
    Nf = (*pNf)%1000;
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n(%D) % Nf(%D) != 0",n,Nf);

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());
  constexpr int block = 128;
  mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
  CHECK_LAUNCH_ERROR(); // does a sync

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  PetscLogGpuTimeEnd();

  return(0);
}