Actual source code: sfhip.hip.cpp

  1: #include <../src/vec/is/sf/impls/basic/sfpack.h>

  3: /* PetscComplex is disabled here to work around compilation issues on SPOCK */
  4: #undef PETSC_HAVE_COMPLEX

  6: /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
  7: __device__ static inline PetscInt MapTidToIndex(const PetscInt *opt,PetscInt tid)
  8: {
  9:   PetscInt        i,j,k,m,n,r;
 10:   const PetscInt  *offset,*start,*dx,*dy,*X,*Y;

 12:   n      = opt[0];
 13:   offset = opt + 1;
 14:   start  = opt + n + 2;
 15:   dx     = opt + 2*n + 2;
 16:   dy     = opt + 3*n + 2;
 17:   X      = opt + 5*n + 2;
 18:   Y      = opt + 6*n + 2;
 19:   for (r=0; r<n; r++) {if (tid < offset[r+1]) break;}
 20:   m = (tid - offset[r]);
 21:   k = m/(dx[r]*dy[r]);
 22:   j = (m - k*dx[r]*dy[r])/dx[r];
 23:   i = m - k*dx[r]*dy[r] - j*dx[r];

 25:   return (start[r] + k*X[r]*Y[r] + j*X[r] + i);
 26: }
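/* Worked example (added for illustration; values are hypothetical): let opt describe n=1
   subdomain, a 2x2x2 block with origin (1,1,1) inside a 4x4x4 root array:

     opt[] = {1,  0,8,  21,  2,  2,  2,  4,  4}
              n  offset start dx  dy (dz)  X  Y

   start = 1*X*Y + 1*X + 1 = 21, and offset[] holds prefix sums of the block volumes (the slot
   between dy and X is inferred to be dz; it is not read by this routine). For tid=5: r=0, m=5,
   k=5/(2*2)=1, j=(5-4)/2=0, i=1, so the result is 21 + 1*4*4 + 0*4 + 1 = 38, which is exactly
   point (2,1,2) of the 4x4x4 array. */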

 28: /*====================================================================================*/
 29: /*  Templated HIP kernels for pack/unpack. The Op can be regular or atomic           */
 30: /*====================================================================================*/

 32: /* Suppose a user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals. Then
 33:    <Type> is PetscReal, which is the primitive type we operate on;
 34:    <bs>   is 16, which says <unit> contains 16 primitive types;
 35:    <BS>   is 8, which is the maximal SIMD width with which we will try to vectorize operations on <unit>;
 36:    <EQ>   is 0, which is (bs == BS ? 1 : 0).

 38:   If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below a compile-time constant.
 39:   For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below are fully unrolled.
 40: */
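/* Illustrative instantiations, assuming PetscReal is double (so MPIU_REAL is MPI_DOUBLE) and
   matching the dispatch in PetscSFLinkSetUp_HIP below:
     unit = 16 MPI_DOUBLEs -> d_Pack<PetscReal,8,0>: bs=16 stays a runtime value; M=2, MBS=16;
     unit = 8  MPI_DOUBLEs -> d_Pack<PetscReal,8,1>: EQ=1 makes M=1 and MBS=8 compile-time constants;
     unit = 1  MPI_DOUBLE  -> d_Pack<PetscReal,1,1>: MBS=1, so the inner loop vanishes entirely. */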
 41: template<class Type,PetscInt BS,PetscInt EQ>
 42: __global__ static void d_Pack(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,const Type *data,Type *buf)
 43: {
 44:   PetscInt        i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
 45:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 46:   const PetscInt  M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */
 47:   const PetscInt  MBS = M*BS;  /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */

 49:   for (; tid<count; tid += grid_size) {
 50:     /* opt != NULL ==> idx == NULL, i.e., the indices have patterns but are not contiguous;
 51:        opt == NULL && idx == NULL ==> the indices are contiguous; otherwise idx[] lists arbitrary indices.
 52:      */
 53:     t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
 54:     s = tid*MBS;
 55:     for (i=0; i<MBS; i++) buf[s+i] = data[t+i];
 56:   }
 57: }
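/* Pack example (illustrative): with MBS=2 and idx={3,7}, thread 0 copies data[6..7] to buf[0..1]
   and thread 1 copies data[14..15] to buf[2..3], i.e. the selected units are gathered into a
   contiguous buffer. */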

 59: template<class Type,class Op,PetscInt BS,PetscInt EQ>
 60: __global__ static void d_UnpackAndOp(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,Type *data,const Type *buf)
 61: {
 62:   PetscInt        i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
 63:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 64:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
 65:   Op              op;

 67:   for (; tid<count; tid += grid_size) {
 68:     t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
 69:     s = tid*MBS;
 70:     for (i=0; i<MBS; i++) op(data[t+i],buf[s+i]);
 71:   }
 72: }

 74: template<class Type,class Op,PetscInt BS,PetscInt EQ>
 75: __global__ static void d_FetchAndOp(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,Type *leafbuf)
 76: {
 77:   PetscInt        i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
 78:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 79:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
 80:   Op              op;

 82:   for (; tid<count; tid += grid_size) {
 83:     r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
 84:     l = tid*MBS;
 85:     for (i=0; i<MBS; i++) leafbuf[l+i] = op(rootdata[r+i],leafbuf[l+i]);
 86:   }
 87: }

 89: template<class Type,class Op,PetscInt BS,PetscInt EQ>
 90: __global__ static void d_ScatterAndOp(PetscInt bs,PetscInt count,PetscInt srcx,PetscInt srcy,PetscInt srcX,PetscInt srcY,PetscInt srcStart,const PetscInt* srcIdx,const Type *src,PetscInt dstx,PetscInt dsty,PetscInt dstX,PetscInt dstY,PetscInt dstStart,const PetscInt *dstIdx,Type *dst)
 91: {
 92:   PetscInt        i,j,k,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
 93:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 94:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
 95:   Op              op;

 97:   for (; tid<count; tid += grid_size) {
 98:     if (!srcIdx) { /* src is either contiguous or 3D */
 99:       k = tid/(srcx*srcy);
100:       j = (tid - k*srcx*srcy)/srcx;
101:       i = tid - k*srcx*srcy - j*srcx;
102:       s = srcStart + k*srcX*srcY + j*srcX + i;
103:     } else {
104:       s = srcIdx[tid];
105:     }

107:     if (!dstIdx) { /* dst is either contiguous or 3D */
108:       k = tid/(dstx*dsty);
109:       j = (tid - k*dstx*dsty)/dstx;
110:       i = tid - k*dstx*dsty - j*dstx;
111:       t = dstStart + k*dstX*dstY + j*dstX + i;
112:     } else {
113:       t = dstIdx[tid];
114:     }

116:     s *= MBS;
117:     t *= MBS;
118:     for (i=0; i<MBS; i++) op(dst[t+i],src[s+i]);
119:   }
120: }

122: template<class Type,class Op,PetscInt BS,PetscInt EQ>
123: __global__ static void d_FetchAndOpLocal(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,PetscInt leafstart,const PetscInt *leafopt,const PetscInt *leafidx,const Type *leafdata,Type *leafupdate)
124: {
125:   PetscInt        i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
126:   const PetscInt  grid_size = gridDim.x * blockDim.x;
127:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
128:   Op              op;

130:   for (; tid<count; tid += grid_size) {
131:     r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
132:     l = (leafopt? MapTidToIndex(leafopt,tid) : (leafidx? leafidx[tid] : leafstart+tid))*MBS;
133:     for (i=0; i<MBS; i++) leafupdate[l+i] = op(rootdata[r+i],leafdata[l+i]);
134:   }
135: }

137: /*====================================================================================*/
138: /*                             Regular operations on device                           */
139: /*====================================================================================*/
140: template<typename Type> struct Insert {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = y;             return old;}};
141: template<typename Type> struct Add    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x += y;             return old;}};
142: template<typename Type> struct Mult   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x *= y;             return old;}};
143: template<typename Type> struct Min    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMin(x,y); return old;}};
144: template<typename Type> struct Max    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMax(x,y); return old;}};
145: template<typename Type> struct LAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x && y;        return old;}};
146: template<typename Type> struct LOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x || y;        return old;}};
147: template<typename Type> struct LXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = !x != !y;      return old;}};
148: template<typename Type> struct BAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x & y;         return old;}};
149: template<typename Type> struct BOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x | y;         return old;}};
150: template<typename Type> struct BXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x ^ y;         return old;}};
151: template<typename Type> struct Minloc {
152:   __device__ Type operator() (Type& x,Type y) const {
153:     Type old = x;
154:     if (y.a < x.a) x = y;
155:     else if (y.a == x.a) x.b = min(x.b,y.b);
156:     return old;
157:   }
158: };
159: template<typename Type> struct Maxloc {
160:   __device__ Type operator() (Type& x,Type y) const {
161:     Type old = x;
162:     if (y.a > x.a) x = y;
163:     else if (y.a == x.a) x.b = min(x.b,y.b); /* See MPI MAXLOC */
164:     return old;
165:   }
166: };
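/* Tie-breaking example: for x={3.0,5} and y={3.0,2}, both Minloc and Maxloc keep a=3.0 and set
   b=min(5,2)=2, matching MPI_MINLOC/MPI_MAXLOC semantics where ties take the smaller index. */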

168: /*====================================================================================*/
169: /*                             Atomic operations on device                            */
170: /*====================================================================================*/

172: /*
173:   Atomic Insert (exchange) operations

175:   See the CUDA version for comments.
176: */
177: __device__ static double atomicExch(double* address,double val) {return __longlong_as_double(atomicExch((ullint*)address,__double_as_longlong(val)));}

179: __device__ static llint atomicExch(llint* address,llint val) {return (llint)(atomicExch((ullint*)address,(ullint)val));}

181: template<typename Type> struct AtomicInsert {__device__ Type operator() (Type& x,Type y) const {return atomicExch(&x,y);}};

183: #if defined(PETSC_HAVE_COMPLEX)
184: #if defined(PETSC_USE_REAL_DOUBLE)
185: template<> struct AtomicInsert<PetscComplex> {
186:   __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
187:     PetscComplex         old, *z = &old;
188:     double               *xp = (double*)&x,*yp = (double*)&y;
189:     AtomicInsert<double> op;
190:     z[0] = op(xp[0],yp[0]);
191:     z[1] = op(xp[1],yp[1]);
192:     return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
193:   }
194: };
195: #elif defined(PETSC_USE_REAL_SINGLE)
196: template<> struct AtomicInsert<PetscComplex> {
197:   __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
198:     double               *xp = (double*)&x,*yp = (double*)&y;
199:     AtomicInsert<double> op;
200:     return op(xp[0],yp[0]);
201:   }
202: };
203: #endif
204: #endif

206: /*
207:   Atomic add operations

209: */
210: __device__ static llint atomicAdd(llint* address,llint val) {return (llint)atomicAdd((ullint*)address,(ullint)val);}

212: template<typename Type> struct AtomicAdd {__device__ Type operator() (Type& x,Type y) const {return atomicAdd(&x,y);}};

214: template<> struct AtomicAdd<double> {
215:   __device__ double operator() (double& x,double y) const {
216:    /* The CUDA version does additional checks that may also be needed here */
217:     return atomicAdd(&x,y);
218:   }
219: };

221: template<> struct AtomicAdd<float> {
222:   __device__ float operator() (float& x,float y) const {
223:     /* The CUDA version does additional checks that may also be needed here */
224:     return atomicAdd(&x,y);
225:   }
226: };

228: #if defined(PETSC_HAVE_COMPLEX)
229: template<> struct AtomicAdd<PetscComplex> {
230:  __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
231:   PetscComplex         old, *z = &old;
232:   PetscReal            *xp = (PetscReal*)&x,*yp = (PetscReal*)&y;
233:   AtomicAdd<PetscReal> op;
234:   z[0] = op(xp[0],yp[0]);
235:   z[1] = op(xp[1],yp[1]);
236:   return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
237:  }
238: };
239: #endif

241: /*
242:   Atomic Mult operations:

244:   HIP has no atomicMult at all, so we build our own with atomicCAS
245:  */
246: #if defined(PETSC_USE_REAL_DOUBLE)
247: __device__ static double atomicMult(double* address, double val)
248: {
249:   ullint *address_as_ull = (ullint*)(address);
250:   ullint old = *address_as_ull, assumed;
251:   do {
252:     assumed = old;
253:     /* Other threads can access and modify value of *address_as_ull after the read above and before the write below */
254:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val*__longlong_as_double(assumed)));
255:   } while (assumed != old);
256:   return __longlong_as_double(old);
257: }
258: #elif defined(PETSC_USE_REAL_SINGLE)
259: __device__ static float atomicMult(float* address,float val)
260: {
261:   int *address_as_int = (int*)(address);
262:   int old = *address_as_int, assumed;
263:   do {
264:     assumed  = old;
265:     old      = atomicCAS(address_as_int, assumed, __float_as_int(val*__int_as_float(assumed)));
266:   } while (assumed != old);
267:   return __int_as_float(old);
268: }
269: #endif

271: __device__ static int atomicMult(int* address,int val)
272: {
273:   int *address_as_int = (int*)(address);
274:   int old = *address_as_int, assumed;
275:   do {
276:     assumed = old;
277:     old     = atomicCAS(address_as_int, assumed, val*assumed);
278:   } while (assumed != old);
279:   return (int)old;
280: }

282: __device__ static llint atomicMult(llint* address,llint val)
283: {
284:   ullint *address_as_ull = (ullint*)(address);
285:   ullint old = *address_as_ull, assumed;
286:   do {
287:     assumed = old;
288:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val*(llint)assumed));
289:   } while (assumed != old);
290:   return (llint)old;
291: }

293: template<typename Type> struct AtomicMult {__device__ Type operator() (Type& x,Type y) const {return atomicMult(&x,y);}};
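/* The CAS-based emulations above and below all follow the same retry pattern: snapshot the
   current value, compute the new value from the snapshot, and publish it with atomicCAS; if
   another thread changed *address in between, atomicCAS returns the fresh value and we retry.
   Schematically (f is a placeholder for the desired binary op):

     do {
       assumed = old;                                         // snapshot
       old     = atomicCAS(address, assumed, f(assumed,val)); // publish iff still unchanged
     } while (assumed != old);

   The final 'old' is the value just before the successful update, so these routines also
   provide fetch-and-op semantics. */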

295: /*
296:   Atomic Min/Max operations

298:   See CUDA version for comments.
299:  */

301: #if defined(PETSC_USE_REAL_DOUBLE)
302: __device__ static double atomicMin(double* address, double val)
303: {
304:   ullint *address_as_ull = (ullint*)(address);
305:   ullint old = *address_as_ull, assumed;
306:   do {
307:     assumed = old;
308:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val,__longlong_as_double(assumed))));
309:   } while (assumed != old);
310:   return __longlong_as_double(old);
311: }

313: __device__ static double atomicMax(double* address, double val)
314: {
315:   ullint *address_as_ull = (ullint*)(address);
316:   ullint old = *address_as_ull, assumed;
317:   do {
318:     assumed  = old;
319:     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val,__longlong_as_double(assumed))));
320:   } while (assumed != old);
321:   return __longlong_as_double(old);
322: }
323: #elif defined(PETSC_USE_REAL_SINGLE)
324: __device__ static float atomicMin(float* address,float val)
325: {
326:   int *address_as_int = (int*)(address);
327:   int old = *address_as_int, assumed;
328:   do {
329:     assumed = old;
330:     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val,__int_as_float(assumed))));
331:   } while (assumed != old);
332:   return __int_as_float(old);
333: }

335: __device__ static float atomicMax(float* address,float val)
336: {
337:   int *address_as_int = (int*)(address);
338:   int old = *address_as_int, assumed;
339:   do {
340:     assumed = old;
341:     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val,__int_as_float(assumed))));
342:   } while (assumed != old);
343:   return __int_as_float(old);
344: }
345: #endif

347: /* As of ROCm 3.10, llint atomicMin/Max(llint*, llint) is not supported */
348: __device__ static llint atomicMin(llint* address,llint val)
349: {
350:   ullint *address_as_ull = (ullint*)(address);
351:   ullint old = *address_as_ull, assumed;
352:   do {
353:     assumed = old;
354:     old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val,(llint)assumed)));
355:   } while (assumed != old);
356:   return (llint)old;
357: }

359: __device__ static llint atomicMax(llint* address,llint val)
360: {
361:   ullint *address_as_ull = (ullint*)(address);
362:   ullint old = *address_as_ull, assumed;
363:   do {
364:     assumed = old;
365:     old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val,(llint)assumed)));
366:   } while (assumed != old);
367:   return (llint)old;
368: }

370: template<typename Type> struct AtomicMin {__device__ Type operator() (Type& x,Type y) const {return atomicMin(&x,y);}};
371: template<typename Type> struct AtomicMax {__device__ Type operator() (Type& x,Type y) const {return atomicMax(&x,y);}};

373: /*
374:   Atomic bitwise operations
375:   As of ROCm 3.10, the llint atomicAnd/Or/Xor(llint*, llint) is not supported
376: */

378: __device__ static llint atomicAnd(llint* address,llint val)
379: {
380:   ullint *address_as_ull = (ullint*)(address);
381:   ullint old = *address_as_ull, assumed;
382:   do {
383:     assumed = old;
384:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
385:   } while (assumed != old);
386:   return (llint)old;
387: }
388: __device__ static llint atomicOr(llint* address,llint val)
389: {
390:   ullint *address_as_ull = (ullint*)(address);
391:   ullint old = *address_as_ull, assumed;
392:   do {
393:     assumed = old;
394:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
395:   } while (assumed != old);
396:   return (llint)old;
397: }

399: __device__ static llint atomicXor(llint* address,llint val)
400: {
401:   ullint *address_as_ull = (ullint*)(address);
402:   ullint old = *address_as_ull, assumed;
403:   do {
404:     assumed = old;
405:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
406:   } while (assumed != old);
407:   return (llint)old;
408: }

410: template<typename Type> struct AtomicBAND {__device__ Type operator() (Type& x,Type y) const {return atomicAnd(&x,y);}};
411: template<typename Type> struct AtomicBOR  {__device__ Type operator() (Type& x,Type y) const {return atomicOr (&x,y);}};
412: template<typename Type> struct AtomicBXOR {__device__ Type operator() (Type& x,Type y) const {return atomicXor(&x,y);}};

414: /*
415:   Atomic logical operations:

417:   HIP, like CUDA, has no atomic logical operations at all. We support them on integer types.
418: */

420: /* A template without a definition makes any instantiation that does not use the given specializations a compile-time error,
421:    which is what we want, since we only support 32-bit and 64-bit integers.
422:  */
423: template<typename Type,class Op,int size/* sizeof(Type) */> struct AtomicLogical;

425: template<typename Type,class Op>
426: struct AtomicLogical<Type,Op,4> {
427:   __device__ Type operator()(Type& x,Type y) const {
428:     int *address_as_int = (int*)(&x);
429:     int old = *address_as_int, assumed;
430:     Op op;
431:     do {
432:       assumed = old;
433:       old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed,y)));
434:     } while (assumed != old);
435:     return (Type)old;
436:   }
437: };

439: template<typename Type,class Op>
440: struct AtomicLogical<Type,Op,8> {
441:   __device__ Type operator()(Type& x,Type y) const {
442:     ullint *address_as_ull = (ullint*)(&x);
443:     ullint old = *address_as_ull, assumed;
444:     Op op;
445:     do {
446:       assumed = old;
447:       old     = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed,y)));
448:     } while (assumed != old);
449:     return (Type)old;
450:   }
451: };

453: /* Note land/lor/lxor below are different from LAND etc. above. Here we pass arguments by value and return the result of the op (not the old value) */
454: template<typename Type> struct land {__device__ Type operator()(Type x, Type y) {return x && y;}};
455: template<typename Type> struct lor  {__device__ Type operator()(Type x, Type y) {return x || y;}};
456: template<typename Type> struct lxor {__device__ Type operator()(Type x, Type y) {return (!x != !y);}};

458: template<typename Type> struct AtomicLAND {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,land<Type>,sizeof(Type)> op; return op(x,y);}};
459: template<typename Type> struct AtomicLOR  {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lor<Type> ,sizeof(Type)> op; return op(x,y);}};
460: template<typename Type> struct AtomicLXOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lxor<Type>,sizeof(Type)> op; return op(x,y);}};
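/* For example, AtomicLAND<llint> instantiates AtomicLogical<llint,land<llint>,8>, i.e. the
   64-bit CAS loop above with op = land<llint>. */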

462: /*====================================================================================*/
463: /*  Wrapper functions of HIP kernels. Function pointers are stored in 'link'         */
464: /*====================================================================================*/
465: template<typename Type,PetscInt BS,PetscInt EQ>
466: static PetscErrorCode Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *data,void *buf)
467: {
468:   hipError_t         err;
469:   PetscInt           nthreads=256;
470:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
471:   const PetscInt     *iarray=opt ? opt->array : NULL;

474:   if (!count) return(0);
475:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
476:   hipLaunchKernelGGL(HIP_KERNEL_NAME(d_Pack<Type,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,start,iarray,idx,(const Type*)data,(Type*)buf);
477:   err = hipGetLastError();CHKERRHIP(err);
478:   return(0);
479: }
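/* Launch-size example (hypothetical numbers): count=1,000,000 gives nblocks=ceil(1e6/256)=3907;
   with link->maxResidentThreadsPerGPU=81920 this is clamped to 81920/256=320 blocks, and the
   grid-stride loop in d_Pack covers the remaining indices. */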

481: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
482: static PetscErrorCode UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,const void *buf)
483: {
484:   hipError_t         cerr;
485:   PetscInt           nthreads=256;
486:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
487:   const PetscInt     *iarray=opt ? opt->array : NULL;

490:   if (!count) return(0);
491:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
492:   hipLaunchKernelGGL(HIP_KERNEL_NAME(d_UnpackAndOp<Type,Op,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,start,iarray,idx,(Type*)data,(const Type*)buf);
493:   cerr = hipGetLastError();CHKERRHIP(cerr);
494:   return(0);
495: }

497: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
498: static PetscErrorCode FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,void *buf)
499: {
500:   hipError_t         cerr;
501:   PetscInt           nthreads=256;
502:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
503:   const PetscInt     *iarray=opt ? opt->array : NULL;

506:   if (!count) return(0);
507:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
508:   hipLaunchKernelGGL(HIP_KERNEL_NAME(d_FetchAndOp<Type,Op,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,start,iarray,idx,(Type*)data,(Type*)buf);
509:   cerr = hipGetLastError();CHKERRHIP(cerr);
510:   return(0);
511: }

513: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
514: static PetscErrorCode ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
515: {
516:   hipError_t         cerr;
517:   PetscInt           nthreads=256;
518:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
519:   PetscInt           srcx=0,srcy=0,srcX=0,srcY=0,dstx=0,dsty=0,dstX=0,dstY=0;

522:   if (!count) return(0);
523:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);

525:   /* The 3D shape of the source subdomain may differ from that of the destination, which makes it hard to use HIP's 3D grid and block directly */
526:   if (srcOpt)       {srcx = srcOpt->dx[0]; srcy = srcOpt->dy[0]; srcX = srcOpt->X[0]; srcY = srcOpt->Y[0]; srcStart = srcOpt->start[0]; srcIdx = NULL;}
527:   else if (!srcIdx) {srcx = srcX = count; srcy = srcY = 1;}

529:   if (dstOpt)       {dstx = dstOpt->dx[0]; dsty = dstOpt->dy[0]; dstX = dstOpt->X[0]; dstY = dstOpt->Y[0]; dstStart = dstOpt->start[0]; dstIdx = NULL;}
530:   else if (!dstIdx) {dstx = dstX = count; dsty = dstY = 1;}

532:   hipLaunchKernelGGL(HIP_KERNEL_NAME(d_ScatterAndOp<Type,Op,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,srcx,srcy,srcX,srcY,srcStart,srcIdx,(const Type*)src,dstx,dsty,dstX,dstY,dstStart,dstIdx,(Type*)dst);
533:   cerr = hipGetLastError();CHKERRHIP(cerr);
534:   return(0);
535: }

537: /* Specialization for Insert since we may use hipMemcpyAsync */
538: template<typename Type,PetscInt BS,PetscInt EQ>
539: static PetscErrorCode ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
540: {
541:   PetscErrorCode    ierr;
542:   hipError_t       cerr;

545:   if (!count) return(0);
546:   /* src and dst are contiguous */
547:   if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
548:     cerr = hipMemcpyAsync((Type*)dst+dstStart*link->bs,(const Type*)src+srcStart*link->bs,count*link->unitbytes,hipMemcpyDeviceToDevice,link->stream);CHKERRHIP(cerr);
549:   } else {
550:     ierr = ScatterAndOp<Type,Insert<Type>,BS,EQ>(link,count,srcStart,srcOpt,srcIdx,src,dstStart,dstOpt,dstIdx,dst);CHKERRQ(ierr);
551:   }
552:   return(0);
553: }
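/* Fast-path example (hypothetical values): for Type=double with bs=2 (unitbytes=16), count=100,
   srcStart=0 and dstStart=10, the hipMemcpyAsync above moves 100*16=1600 bytes to
   (double*)dst+20 without launching a kernel. */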

555: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
556: static PetscErrorCode FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate)
557: {
558:   hipError_t        cerr;
559:   PetscInt          nthreads=256;
560:   PetscInt          nblocks=(count+nthreads-1)/nthreads;
561:   const PetscInt    *rarray = rootopt ? rootopt->array : NULL;
562:   const PetscInt    *larray = leafopt ? leafopt->array : NULL;

565:   if (!count) return(0);
566:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
567:   hipLaunchKernelGGL(HIP_KERNEL_NAME(d_FetchAndOpLocal<Type,Op,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,rootstart,rarray,rootidx,(Type*)rootdata,leafstart,larray,leafidx,(const Type*)leafdata,(Type*)leafupdate);
568:   cerr = hipGetLastError();CHKERRHIP(cerr);
569:   return(0);
570: }

572: /*====================================================================================*/
573: /*  Init various types and instantiate pack/unpack function pointers                  */
574: /*====================================================================================*/
575: template<typename Type,PetscInt BS,PetscInt EQ>
576: static void PackInit_RealType(PetscSFLink link)
577: {
578:   /* Pack/unpack for remote communication */
579:   link->d_Pack              = Pack<Type,BS,EQ>;
580:   link->d_UnpackAndInsert   = UnpackAndOp     <Type,Insert<Type>      ,BS,EQ>;
581:   link->d_UnpackAndAdd      = UnpackAndOp     <Type,Add<Type>         ,BS,EQ>;
582:   link->d_UnpackAndMult     = UnpackAndOp     <Type,Mult<Type>        ,BS,EQ>;
583:   link->d_UnpackAndMin      = UnpackAndOp     <Type,Min<Type>         ,BS,EQ>;
584:   link->d_UnpackAndMax      = UnpackAndOp     <Type,Max<Type>         ,BS,EQ>;
585:   link->d_FetchAndAdd       = FetchAndOp      <Type,Add<Type>         ,BS,EQ>;

587:   /* Scatter for local communication */
588:   link->d_ScatterAndInsert  = ScatterAndInsert<Type                   ,BS,EQ>; /* Has special optimizations */
589:   link->d_ScatterAndAdd     = ScatterAndOp    <Type,Add<Type>         ,BS,EQ>;
590:   link->d_ScatterAndMult    = ScatterAndOp    <Type,Mult<Type>        ,BS,EQ>;
591:   link->d_ScatterAndMin     = ScatterAndOp    <Type,Min<Type>         ,BS,EQ>;
592:   link->d_ScatterAndMax     = ScatterAndOp    <Type,Max<Type>         ,BS,EQ>;
593:   link->d_FetchAndAddLocal  = FetchAndOpLocal <Type,Add <Type>        ,BS,EQ>;

595:   /* Atomic versions when there are data-race possibilities */
596:   link->da_UnpackAndInsert  = UnpackAndOp     <Type,AtomicInsert<Type>,BS,EQ>;
597:   link->da_UnpackAndAdd     = UnpackAndOp     <Type,AtomicAdd<Type>   ,BS,EQ>;
598:   link->da_UnpackAndMult    = UnpackAndOp     <Type,AtomicMult<Type>  ,BS,EQ>;
599:   link->da_UnpackAndMin     = UnpackAndOp     <Type,AtomicMin<Type>   ,BS,EQ>;
600:   link->da_UnpackAndMax     = UnpackAndOp     <Type,AtomicMax<Type>   ,BS,EQ>;
601:   link->da_FetchAndAdd      = FetchAndOp      <Type,AtomicAdd<Type>   ,BS,EQ>;

603:   link->da_ScatterAndInsert = ScatterAndOp    <Type,AtomicInsert<Type>,BS,EQ>;
604:   link->da_ScatterAndAdd    = ScatterAndOp    <Type,AtomicAdd<Type>   ,BS,EQ>;
605:   link->da_ScatterAndMult   = ScatterAndOp    <Type,AtomicMult<Type>  ,BS,EQ>;
606:   link->da_ScatterAndMin    = ScatterAndOp    <Type,AtomicMin<Type>   ,BS,EQ>;
607:   link->da_ScatterAndMax    = ScatterAndOp    <Type,AtomicMax<Type>   ,BS,EQ>;
608:   link->da_FetchAndAddLocal = FetchAndOpLocal <Type,AtomicAdd<Type>   ,BS,EQ>;
609: }

611: /* Use a templated class so that we can specialize it for char-sized integers */
612: template<typename Type,PetscInt BS,PetscInt EQ,PetscInt size/*sizeof(Type)*/>
613: struct PackInit_IntegerType_Atomic {
614:   static void Init(PetscSFLink link)
615:   {
616:     link->da_UnpackAndInsert  = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
617:     link->da_UnpackAndAdd     = UnpackAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
618:     link->da_UnpackAndMult    = UnpackAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
619:     link->da_UnpackAndMin     = UnpackAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
620:     link->da_UnpackAndMax     = UnpackAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
621:     link->da_UnpackAndLAND    = UnpackAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
622:     link->da_UnpackAndLOR     = UnpackAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
623:     link->da_UnpackAndLXOR    = UnpackAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
624:     link->da_UnpackAndBAND    = UnpackAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
625:     link->da_UnpackAndBOR     = UnpackAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
626:     link->da_UnpackAndBXOR    = UnpackAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
627:     link->da_FetchAndAdd      = FetchAndOp <Type,AtomicAdd<Type>   ,BS,EQ>;

629:     link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
630:     link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
631:     link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
632:     link->da_ScatterAndMin    = ScatterAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
633:     link->da_ScatterAndMax    = ScatterAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
634:     link->da_ScatterAndLAND   = ScatterAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
635:     link->da_ScatterAndLOR    = ScatterAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
636:     link->da_ScatterAndLXOR   = ScatterAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
637:     link->da_ScatterAndBAND   = ScatterAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
638:     link->da_ScatterAndBOR    = ScatterAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
639:     link->da_ScatterAndBXOR   = ScatterAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
640:     link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicAdd<Type>,BS,EQ>;
641:   }
642: };

644: /* See the CUDA version */
645: template<typename Type,PetscInt BS,PetscInt EQ>
646: struct PackInit_IntegerType_Atomic<Type,BS,EQ,1> {
647:   static void Init(PetscSFLink link) {/* Intentionally empty: leave the atomic function pointers NULL, since atomics on char-sized types are not supported */}
648: };

650: template<typename Type,PetscInt BS,PetscInt EQ>
651: static void PackInit_IntegerType(PetscSFLink link)
652: {
653:   link->d_Pack            = Pack<Type,BS,EQ>;
654:   link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
655:   link->d_UnpackAndAdd    = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
656:   link->d_UnpackAndMult   = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
657:   link->d_UnpackAndMin    = UnpackAndOp<Type,Min<Type>   ,BS,EQ>;
658:   link->d_UnpackAndMax    = UnpackAndOp<Type,Max<Type>   ,BS,EQ>;
659:   link->d_UnpackAndLAND   = UnpackAndOp<Type,LAND<Type>  ,BS,EQ>;
660:   link->d_UnpackAndLOR    = UnpackAndOp<Type,LOR<Type>   ,BS,EQ>;
661:   link->d_UnpackAndLXOR   = UnpackAndOp<Type,LXOR<Type>  ,BS,EQ>;
662:   link->d_UnpackAndBAND   = UnpackAndOp<Type,BAND<Type>  ,BS,EQ>;
663:   link->d_UnpackAndBOR    = UnpackAndOp<Type,BOR<Type>   ,BS,EQ>;
664:   link->d_UnpackAndBXOR   = UnpackAndOp<Type,BXOR<Type>  ,BS,EQ>;
665:   link->d_FetchAndAdd     = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

667:   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
668:   link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
669:   link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
670:   link->d_ScatterAndMin    = ScatterAndOp<Type,Min<Type>   ,BS,EQ>;
671:   link->d_ScatterAndMax    = ScatterAndOp<Type,Max<Type>   ,BS,EQ>;
672:   link->d_ScatterAndLAND   = ScatterAndOp<Type,LAND<Type>  ,BS,EQ>;
673:   link->d_ScatterAndLOR    = ScatterAndOp<Type,LOR<Type>   ,BS,EQ>;
674:   link->d_ScatterAndLXOR   = ScatterAndOp<Type,LXOR<Type>  ,BS,EQ>;
675:   link->d_ScatterAndBAND   = ScatterAndOp<Type,BAND<Type>  ,BS,EQ>;
676:   link->d_ScatterAndBOR    = ScatterAndOp<Type,BOR<Type>   ,BS,EQ>;
677:   link->d_ScatterAndBXOR   = ScatterAndOp<Type,BXOR<Type>  ,BS,EQ>;
678:   link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
679:   PackInit_IntegerType_Atomic<Type,BS,EQ,sizeof(Type)>::Init(link);
680: }

682: #if defined(PETSC_HAVE_COMPLEX)
683: template<typename Type,PetscInt BS,PetscInt EQ>
684: static void PackInit_ComplexType(PetscSFLink link)
685: {
686:   link->d_Pack             = Pack<Type,BS,EQ>;
687:   link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
688:   link->d_UnpackAndAdd     = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
689:   link->d_UnpackAndMult    = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
690:   link->d_FetchAndAdd      = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

692:   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
693:   link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
694:   link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
695:   link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;

697:   link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
698:   link->da_UnpackAndAdd    = UnpackAndOp<Type,AtomicAdd<Type>,BS,EQ>;
699:   link->da_UnpackAndMult   = NULL; /* Not implemented yet */
700:   link->da_FetchAndAdd     = NULL; /* Return value of atomicAdd on complex is not atomic */

702:   link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
703:   link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>,BS,EQ>;
704: }
705: #endif

707: typedef signed char                      SignedChar;
708: typedef unsigned char                    UnsignedChar;
709: typedef struct {int a;      int b;     } PairInt;
710: typedef struct {PetscInt a; PetscInt b;} PairPetscInt;

712: template<typename Type>
713: static void PackInit_PairType(PetscSFLink link)
714: {
715:   link->d_Pack            = Pack<Type,1,1>;
716:   link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,1,1>;
717:   link->d_UnpackAndMaxloc = UnpackAndOp<Type,Maxloc<Type>,1,1>;
718:   link->d_UnpackAndMinloc = UnpackAndOp<Type,Minloc<Type>,1,1>;

720:   link->d_ScatterAndInsert = ScatterAndOp<Type,Insert<Type>,1,1>;
721:   link->d_ScatterAndMaxloc = ScatterAndOp<Type,Maxloc<Type>,1,1>;
722:   link->d_ScatterAndMinloc = ScatterAndOp<Type,Minloc<Type>,1,1>;
723:   /* Atomics for pair types are not implemented yet */
724: }

726: template<typename Type,PetscInt BS,PetscInt EQ>
727: static void PackInit_DumbType(PetscSFLink link)
728: {
729:   link->d_Pack             = Pack<Type,BS,EQ>;
730:   link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
731:   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
732:   /* Atomics for dumb types are not implemented yet */
733: }

735: /* Some device-specific utilities */
736: static PetscErrorCode PetscSFLinkSyncDevice_HIP(PetscSFLink link)
737: {
738:   hipError_t cerr;
740:   cerr = hipDeviceSynchronize();CHKERRHIP(cerr);
741:   return(0);
742: }

744: static PetscErrorCode PetscSFLinkSyncStream_HIP(PetscSFLink link)
745: {
746:   hipError_t cerr;
748:   cerr = hipStreamSynchronize(link->stream);CHKERRHIP(cerr);
749:   return(0);
750: }

752: static PetscErrorCode PetscSFLinkMemcpy_HIP(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
753: {
755:   enum hipMemcpyKind kinds[2][2] = {{hipMemcpyHostToHost,hipMemcpyHostToDevice},{hipMemcpyDeviceToHost,hipMemcpyDeviceToDevice}};

757:   if (n) {
758:     if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) { /* Separate HostToHost so that pure-cpu code won't call hip runtime */
759:       PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);
760:     } else {
761:       int stype = PetscMemTypeDevice(srcmtype) ? 1 : 0;
762:       int dtype = PetscMemTypeDevice(dstmtype) ? 1 : 0;
763:       hipError_t cerr = hipMemcpyAsync(dst,src,n,kinds[stype][dtype],link->stream);CHKERRHIP(cerr);
764:     }
765:   }
766:   return(0);
767: }
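/* Indexing example: srcmtype=PETSC_MEMTYPE_DEVICE and dstmtype=PETSC_MEMTYPE_HOST give stype=1
   and dtype=0, i.e. kinds[1][0]=hipMemcpyDeviceToHost. */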

769: PetscErrorCode PetscSFMalloc_HIP(PetscMemType mtype,size_t size,void** ptr)
770: {
772:   if (PetscMemTypeHost(mtype)) {PetscErrorCode ierr = PetscMalloc(size,ptr);CHKERRQ(ierr);}
773:   else if (PetscMemTypeDevice(mtype)) {
774:     if (!PetscHIPInitialized) {PetscErrorCode ierr = PetscHIPInitializeCheck();CHKERRQ(ierr);}
775:     hipError_t err = hipMalloc(ptr,size);CHKERRHIP(err);
776:   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d", (int)mtype);
777:   return(0);
778: }

780: PetscErrorCode PetscSFFree_HIP(PetscMemType mtype,void* ptr)
781: {
783:   if (PetscMemTypeHost(mtype)) {PetscErrorCode ierr = PetscFree(ptr);CHKERRQ(ierr);}
784:   else if (PetscMemTypeDevice(mtype)) {hipError_t err = hipFree(ptr);CHKERRHIP(err);}
785:   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
786:   return(0);
787: }

789: /* Destructor when the link uses MPI for communication on HIP device */
790: static PetscErrorCode PetscSFLinkDestroy_MPI_HIP(PetscSF sf,PetscSFLink link)
791: {
792:   hipError_t    cerr;

795:   for (int i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
796:     cerr = hipFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRHIP(cerr);
797:     cerr = hipFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRHIP(cerr);
798:   }
799:   return(0);
800: }

802: /*====================================================================================*/
803: /*                Main driver to init MPI datatype on device                          */
804: /*====================================================================================*/

806: /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
807: PetscErrorCode PetscSFLinkSetUp_HIP(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
808: {
809:   PetscErrorCode ierr;
810:   hipError_t     cerr;
811:   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
812:   PetscBool      is2Int,is2PetscInt;
813: #if defined(PETSC_HAVE_COMPLEX)
814:   PetscInt       nPetscComplex=0;
815: #endif

818:   if (link->deviceinited) return(0);
819:   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
820:   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
821:   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
822:   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
823:   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
824:   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
825: #if defined(PETSC_HAVE_COMPLEX)
826:   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
827: #endif
828:   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
829:   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);

831:   if (is2Int) {
832:     PackInit_PairType<PairInt>(link);
833:   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
834:     PackInit_PairType<PairPetscInt>(link);
835:   } else if (nPetscReal) {
836:     if      (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
837:     else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
838:     else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
839:     else if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0) PackInit_RealType<PetscReal,1,0>(link);
840:   } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
841:     if      (nPetscInt == 8) PackInit_IntegerType<llint,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<llint,8,0>(link);
842:     else if (nPetscInt == 4) PackInit_IntegerType<llint,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<llint,4,0>(link);
843:     else if (nPetscInt == 2) PackInit_IntegerType<llint,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<llint,2,0>(link);
844:     else if (nPetscInt == 1) PackInit_IntegerType<llint,1,1>(link); else if (nPetscInt%1 == 0) PackInit_IntegerType<llint,1,0>(link);
845:   } else if (nInt) {
846:     if      (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
847:     else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
848:     else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
849:     else if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0) PackInit_IntegerType<int,1,0>(link);
850:   } else if (nSignedChar) {
851:     if      (nSignedChar == 8) PackInit_IntegerType<SignedChar,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<SignedChar,8,0>(link);
852:     else if (nSignedChar == 4) PackInit_IntegerType<SignedChar,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<SignedChar,4,0>(link);
853:     else if (nSignedChar == 2) PackInit_IntegerType<SignedChar,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<SignedChar,2,0>(link);
854:     else if (nSignedChar == 1) PackInit_IntegerType<SignedChar,1,1>(link); else if (nSignedChar%1 == 0) PackInit_IntegerType<SignedChar,1,0>(link);
855:   }  else if (nUnsignedChar) {
856:     if      (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<UnsignedChar,8,0>(link);
857:     else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<UnsignedChar,4,0>(link);
858:     else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<UnsignedChar,2,0>(link);
859:     else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar,1,1>(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType<UnsignedChar,1,0>(link);
860: #if defined(PETSC_HAVE_COMPLEX)
861:   } else if (nPetscComplex) {
862:     if      (nPetscComplex == 8) PackInit_ComplexType<PetscComplex,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<PetscComplex,8,0>(link);
863:     else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<PetscComplex,4,0>(link);
864:     else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<PetscComplex,2,0>(link);
865:     else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex,1,1>(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType<PetscComplex,1,0>(link);
866: #endif
867:   } else {
868:     MPI_Aint lb,nbyte;
869:     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
870:     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
871:     if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
872:       if      (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
873:       else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
874:       else if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0) PackInit_DumbType<char,1,0>(link);
875:     } else {
876:       nInt = nbyte / sizeof(int);
877:       if      (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
878:       else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
879:       else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
880:       else if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0) PackInit_DumbType<int,1,0>(link);
881:     }
882:   }
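  /* Dispatch example: a unit of 3 contiguous MPIU_REALs sets nPetscReal=3, which falls through
     to the nPetscReal%1 branch and selects PackInit_RealType<PetscReal,1,0>, i.e. BS=1, EQ=0,
     and a runtime MBS equal to link->bs=3 in the kernels above (link->bs itself is set during
     pack setup on the host). */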

884:   if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
885:     int                   device;
886:     struct hipDeviceProp_t props;
887:     cerr = hipGetDevice(&device);CHKERRHIP(cerr);
888:     cerr = hipGetDeviceProperties(&props,device);CHKERRHIP(cerr);
889:     sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor*props.multiProcessorCount;
890:   }
891:   link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;

893:   link->stream       = PetscDefaultHipStream;
894:   link->Destroy      = PetscSFLinkDestroy_MPI_HIP;
895:   link->SyncDevice   = PetscSFLinkSyncDevice_HIP;
896:   link->SyncStream   = PetscSFLinkSyncStream_HIP;
897:   link->Memcpy       = PetscSFLinkMemcpy_HIP;
898:   link->deviceinited = PETSC_TRUE;
899:   return(0);
900: }