Actual source code: vechip2.hip.cpp
/*
   Implements the sequential HIP vectors.
*/

#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/hipvecimpl.h>

#include <hip/hip_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>

#if defined(PETSC_USE_COMPLEX)
/* SPOCK compilation issues: division and multiplication of complex numbers must be unrolled by hand */
struct PetscDivideComplex
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& lhs, const PetscScalar& rhs)
  {
    PetscReal lx = PetscRealPart(lhs);
    PetscReal ly = PetscImaginaryPart(lhs);
    PetscReal rx = PetscRealPart(rhs);
    PetscReal ry = PetscImaginaryPart(rhs);
    PetscReal n  = rx*rx + ry*ry;
    return PetscComplex((lx*rx + ly*ry)/n,(rx*ly - lx*ry)/n);
  }
};

struct PetscMultiplyComplex
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& lhs, const PetscScalar& rhs)
  {
    PetscReal lx = PetscRealPart(lhs);
    PetscReal ly = PetscImaginaryPart(lhs);
    PetscReal rx = PetscRealPart(rhs);
    PetscReal ry = PetscImaginaryPart(rhs);
    return PetscComplex(lx*rx - ly*ry,ly*rx + lx*ry);
  }
};
#endif
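
/*
   Editor's note (not in the original source): the two functors above implement the
   textbook identities

     (a+bi)*(c+di) = (ac - bd) + (bc + ad)i
     (a+bi)/(c+di) = (a+bi)*(c-di)/(c*c + d*d) = ((ac + bd) + (bc - ad)i)/(c*c + d*d)

   i.e. exactly what a conforming complex library would compute; they exist only to work
   around the SPOCK compiler issue mentioned above.
*/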

/*
   Allocates space for the vector array on the GPU if it does not exist.
   Does NOT change the offload mask (v->offloadmask) of the vector.
   Does NOT zero the HIP array.
*/
PetscErrorCode VecHIPAllocateCheck(Vec v)
{
  Vec_HIP   *vechip;
  PetscBool option_set;

  if (!v->spptr) {
    PetscReal pinned_memory_min;

    PetscCalloc(sizeof(Vec_HIP),&v->spptr);
    vechip = (Vec_HIP*)v->spptr;
    hipMalloc((void**)&vechip->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));
    vechip->GPUarray = vechip->GPUarray_allocated;
    if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
      if (v->data && ((Vec_Seq*)v->data)->array) {
        v->offloadmask = PETSC_OFFLOAD_CPU;
      } else {
        v->offloadmask = PETSC_OFFLOAD_GPU;
      }
    }
    pinned_memory_min = 0;

    /* Need to parse the command line here for the minimum size at which to use pinned memory for allocations on the host.
       Note: this same code is duplicated in VecCreate_SeqHIP_Private() and VecCreate_MPIHIP_Private(). Is there a good way to avoid this? */
    PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECHIP Options","Vec");
    PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
    if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
    PetscOptionsEnd();
  }
  return 0;
}
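
/*
   Editor's note (not in the original source): the offload mask set above drives every
   host/device transfer below. In outline, assuming the standard PetscOffloadMask values:

     PETSC_OFFLOAD_UNALLOCATED - no valid copy anywhere yet
     PETSC_OFFLOAD_CPU         - the host array holds the valid data
     PETSC_OFFLOAD_GPU         - the device array holds the valid data
     PETSC_OFFLOAD_BOTH        - both copies are valid

   VecHIPCopyToGPU() and VecHIPCopyFromGPU() each move data one way and leave the mask at
   PETSC_OFFLOAD_BOTH.
*/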

/* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
PetscErrorCode VecHIPCopyToGPU(Vec v)
{
  Vec_HIP     *vechip;
  PetscScalar *varray;

  VecHIPAllocateCheck(v);
  if (v->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscLogEventBegin(VEC_HIPCopyToGPU,v,0,0,0);
    vechip = (Vec_HIP*)v->spptr;
    varray = vechip->GPUarray;
    hipMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),hipMemcpyHostToDevice);
    PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
    PetscLogEventEnd(VEC_HIPCopyToGPU,v,0,0,0);
    v->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}

/*
   VecHIPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
*/
PetscErrorCode VecHIPCopyFromGPU(Vec v)
{
  Vec_HIP     *vechip;
  PetscScalar *varray;

  VecHIPAllocateCheckHost(v);
  if (v->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscLogEventBegin(VEC_HIPCopyFromGPU,v,0,0,0);
    vechip = (Vec_HIP*)v->spptr;
    varray = vechip->GPUarray;
    hipMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),hipMemcpyDeviceToHost);
    PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
    PetscLogEventEnd(VEC_HIPCopyFromGPU,v,0,0,0);
    v->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}
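
/*
   Editor's note: a hypothetical call sequence illustrating how the two copy routines pair
   up (sketch only; error checking and declarations omitted):

     VecSet(v,1.0);           // runs on the GPU, mask ends up PETSC_OFFLOAD_GPU
     VecGetArray(v,&a);       // triggers VecHIPCopyFromGPU(); mask becomes PETSC_OFFLOAD_BOTH
     a[0] = 2.0;              // host write; the vector is marked PETSC_OFFLOAD_CPU
     VecRestoreArray(v,&a);
     VecNorm(v,NORM_2,&nrm);  // triggers VecHIPCopyToGPU() before the hipBLAS call
*/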

/*MC
   VECSEQHIP - VECSEQHIP = "seqhip" - The basic sequential vector, modified to use HIP

   Options Database Keys:
. -vec_type seqhip - sets the vector type to VECSEQHIP during a call to VecSetFromOptions()

   Level: beginner

.seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
M*/

PetscErrorCode VecAYPX_SeqHIP(Vec yin,PetscScalar alpha,Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one = 1,bn = 0;
  PetscScalar       sone = 1.0;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(yin->map->n,&bn);
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArray(yin,&yarray);
  PetscLogGpuTimeBegin();
  if (alpha == (PetscScalar)0.0) {
    hipMemcpy(yarray,xarray,bn*sizeof(PetscScalar),hipMemcpyDeviceToDevice);
  } else if (alpha == (PetscScalar)1.0) {
    hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,yarray,one);
    PetscLogGpuFlops(1.0*yin->map->n);
  } else {
    hipblasXscal(hipblasv2handle,bn,&alpha,yarray,one);
    hipblasXaxpy(hipblasv2handle,bn,&sone,xarray,one,yarray,one);
    PetscLogGpuFlops(2.0*yin->map->n);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(xin,&xarray);
  VecHIPRestoreArray(yin,&yarray);
  PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  return 0;
}
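
/* Editor's note: AYPX computes y = alpha*y + x (alpha scales y, in contrast to AXPY's
   y = alpha*x + y), which is why alpha == 0 above degenerates to a device-to-device copy
   of x into y and alpha == 1 to a single axpy. */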

PetscErrorCode VecAXPY_SeqHIP(Vec yin,PetscScalar alpha,Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;
  PetscBool         xiship;

  if (alpha == (PetscScalar)0.0) return 0;
  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscObjectTypeCompareAny((PetscObject)xin,&xiship,VECSEQHIP,VECMPIHIP,"");
  if (xiship) {
    PetscBLASIntCast(yin->map->n,&bn);
    VecHIPGetArrayRead(xin,&xarray);
    VecHIPGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,yarray,one);
    PetscLogGpuTimeEnd();
    VecHIPRestoreArrayRead(xin,&xarray);
    VecHIPRestoreArray(yin,&yarray);
    PetscLogGpuFlops(2.0*yin->map->n);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  } else {
    VecAXPY_Seq(yin,alpha,xin);
  }
  return 0;
}

PetscErrorCode VecPointwiseDivide_SeqHIP(Vec win, Vec xin, Vec yin)
{
  PetscInt                              n = xin->map->n;
  const PetscScalar                     *xarray=NULL,*yarray=NULL;
  PetscScalar                           *warray=NULL;
  thrust::device_ptr<const PetscScalar> xptr,yptr;
  thrust::device_ptr<PetscScalar>       wptr;

  if (xin->boundtocpu || yin->boundtocpu) {
    VecPointwiseDivide_Seq(win,xin,yin);
    return 0;
  }
  VecHIPGetArrayWrite(win,&warray);
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArrayRead(yin,&yarray);
  PetscLogGpuTimeBegin();
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
#if defined(PETSC_USE_COMPLEX)
    thrust::transform(xptr,xptr+n,yptr,wptr,PetscDivideComplex());
#else
    thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
#endif
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(n);
  VecHIPRestoreArrayRead(xin,&xarray);
  VecHIPRestoreArrayRead(yin,&yarray);
  VecHIPRestoreArrayWrite(win,&warray);
  return 0;
}

PetscErrorCode VecWAXPY_SeqHIP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
{
  const PetscScalar *xarray=NULL,*yarray=NULL;
  PetscScalar       *warray=NULL;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(win->map->n,&bn);
  if (alpha == (PetscScalar)0.0) {
    VecCopy_SeqHIP(yin,win);
  } else {
    VecHIPGetArrayRead(xin,&xarray);
    VecHIPGetArrayRead(yin,&yarray);
    VecHIPGetArrayWrite(win,&warray);
    PetscLogGpuTimeBegin();
    hipMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);
    hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,warray,one);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(2.0*win->map->n);
    VecHIPRestoreArrayRead(xin,&xarray);
    VecHIPRestoreArrayRead(yin,&yarray);
    VecHIPRestoreArrayWrite(win,&warray);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  }
  return 0;
}

PetscErrorCode VecMAXPY_SeqHIP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
{
  PetscInt          n = xin->map->n,j;
  PetscScalar       *xarray;
  const PetscScalar *yarray;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;

  PetscLogGpuFlops(nv*2.0*n);
  PetscLogCpuToGpuScalar(nv*sizeof(PetscScalar));
  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(n,&bn);
  VecHIPGetArray(xin,&xarray);
  PetscLogGpuTimeBegin();
  for (j=0; j<nv; j++) {
    VecHIPGetArrayRead(y[j],&yarray);
    hipblasXaxpy(hipblasv2handle,bn,alpha+j,yarray,one,xarray,one);
    VecHIPRestoreArrayRead(y[j],&yarray);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(xin,&xarray);
  return 0;
}
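
/* Editor's note: VecMAXPY_SeqHIP() above is a plain loop of hipblasXaxpy() calls, one per
   input vector, consistent with the nv*2*n flops logged up front. A fused multi-vector
   kernel (as is done for MDot below) could reduce the repeated traffic on xarray, but is
   not attempted here. */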

PetscErrorCode VecDot_SeqHIP(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *xarray,*yarray;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(yin->map->n,&bn);
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArrayRead(yin,&yarray);
  /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
  PetscLogGpuTimeBegin();
  hipblasXdot(hipblasv2handle,bn,yarray,one,xarray,one,z);
  PetscLogGpuTimeEnd();
  if (xin->map->n > 0) {
    PetscLogGpuFlops(2.0*xin->map->n-1);
  }
  PetscLogGpuToCpu(sizeof(PetscScalar));
  VecHIPRestoreArrayRead(xin,&xarray);
  VecHIPRestoreArrayRead(yin,&yarray);
  return 0;
}

//
// HIP kernels for MDot to follow
//

// set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
#define MDOT_WORKGROUP_SIZE 128
#define MDOT_WORKGROUP_NUM  128
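
/*
   Editor's note on the reduction scheme below (not in the original source): each kernel
   computes M dot products in two stages. Stage 1: the grid is split into
   MDOT_WORKGROUP_NUM groups; each group strides over its slice of the vectors, keeps M
   partial sums per thread, reduces them in shared memory, and writes M partial results
   per group to group_results. Stage 2 runs on the host in VecMDot_SeqHIP(): the
   MDOT_WORKGROUP_NUM partial results per dot product are copied back and summed. For
   example, with size = 100000 and 128 groups, each group handles
   (100000-1)/128 + 1 = 782 entries and the host sums 128 numbers per dot product.
*/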

#if !defined(PETSC_USE_COMPLEX)
// M = 2:
__global__ void VecMDot_SeqHIP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
                                       PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
  }
  tmp_buffer[threadIdx.x]                       = group_sum0;
  tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
      tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
  }
}

// M = 3:
__global__ void VecMDot_SeqHIP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
                                       PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 4:
__global__ void VecMDot_SeqHIP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                       PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 8:
__global__ void VecMDot_SeqHIP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                       const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
                                       PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  PetscScalar group_sum4 = 0;
  PetscScalar group_sum5 = 0;
  PetscScalar group_sum6 = 0;
  PetscScalar group_sum7 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
    group_sum4 += entry_x * y4[i];
    group_sum5 += entry_x * y5[i];
    group_sum6 += entry_x * y6[i];
    group_sum7 += entry_x * y7[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
  }
}
#endif /* !defined(PETSC_USE_COMPLEX) */
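
/*
   Editor's note: the kernels above accumulate entry_x * y[i] without conjugation, i.e.
   they compute an unconjugated (real) dot product. That is one reason they are compiled
   only for real scalars; the complex build falls back to the hipblasXdot() calls in
   VecMDot_SeqHIP() below, which handle conjugation correctly.
*/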

PetscErrorCode VecMDot_SeqHIP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
{
  PetscInt          i,n = xin->map->n,current_y_index = 0;
  const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
#if !defined(PETSC_USE_COMPLEX)
  PetscInt          nv1 = ((nv % 4) == 1) ? nv-1 : nv,j;
  PetscScalar       *group_results_gpu,*group_results_cpu;
#endif
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(xin->map->n,&bn);

  /* Handle the case of local size zero first */
  if (!xin->map->n) {
    for (i=0; i<nv; ++i) z[i] = 0;
    return 0;
  }

#if !defined(PETSC_USE_COMPLEX)
  // allocate scratchpad memory for the results of individual work groups:
  hipMalloc((void**)&group_results_gpu, nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
  PetscMalloc1(nv1*MDOT_WORKGROUP_NUM,&group_results_cpu);
#endif
  VecHIPGetArrayRead(xin,&xptr);
  PetscLogGpuTimeBegin();

  while (current_y_index < nv) {
    switch (nv - current_y_index) {

      case 7:
      case 6:
      case 5:
      case 4:
        VecHIPGetArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPGetArrayRead(yin[current_y_index+2],&y2ptr);
        VecHIPGetArrayRead(yin[current_y_index+3],&y3ptr);
#if defined(PETSC_USE_COMPLEX)
        hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
        hipblasXdot(hipblasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);
        hipblasXdot(hipblasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);
#else
        hipLaunchKernelGGL(VecMDot_SeqHIP_kernel4, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecHIPRestoreArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+2],&y2ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+3],&y3ptr);
        current_y_index += 4;
        break;

      case 3:
        VecHIPGetArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPGetArrayRead(yin[current_y_index+2],&y2ptr);

#if defined(PETSC_USE_COMPLEX)
        hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
        hipblasXdot(hipblasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);
#else
        hipLaunchKernelGGL(VecMDot_SeqHIP_kernel3, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecHIPRestoreArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+2],&y2ptr);
        current_y_index += 3;
        break;

      case 2:
        VecHIPGetArrayRead(yin[current_y_index],&y0ptr);
        VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
#if defined(PETSC_USE_COMPLEX)
        hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
#else
        hipLaunchKernelGGL(VecMDot_SeqHIP_kernel2, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecHIPRestoreArrayRead(yin[current_y_index],&y0ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
        current_y_index += 2;
        break;

      case 1:
        VecHIPGetArrayRead(yin[current_y_index],&y0ptr);
        hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        VecHIPRestoreArrayRead(yin[current_y_index],&y0ptr);
        current_y_index += 1;
        break;

      default: // 8 or more vectors left
        VecHIPGetArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPGetArrayRead(yin[current_y_index+2],&y2ptr);
        VecHIPGetArrayRead(yin[current_y_index+3],&y3ptr);
        VecHIPGetArrayRead(yin[current_y_index+4],&y4ptr);
        VecHIPGetArrayRead(yin[current_y_index+5],&y5ptr);
        VecHIPGetArrayRead(yin[current_y_index+6],&y6ptr);
        VecHIPGetArrayRead(yin[current_y_index+7],&y7ptr);
#if defined(PETSC_USE_COMPLEX)
        hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
        hipblasXdot(hipblasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);
        hipblasXdot(hipblasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);
        hipblasXdot(hipblasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);
        hipblasXdot(hipblasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);
        hipblasXdot(hipblasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);
        hipblasXdot(hipblasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);
#else
        hipLaunchKernelGGL(VecMDot_SeqHIP_kernel8, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecHIPRestoreArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+2],&y2ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+3],&y3ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+4],&y4ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+5],&y5ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+6],&y6ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+7],&y7ptr);
        current_y_index += 8;
        break;
    }
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(xin,&xptr);

#if defined(PETSC_USE_COMPLEX)
  PetscLogGpuToCpu(nv*sizeof(PetscScalar));
#else
  // copy results to CPU
  hipMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,hipMemcpyDeviceToHost);

  // sum group results into z
  for (j=0; j<nv1; ++j) {
    z[j] = 0;
    for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
  }
  PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
  hipFree(group_results_gpu);
  PetscFree(group_results_cpu);
  PetscLogGpuToCpu(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
#endif
  PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
  return 0;
}
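
/*
   Editor's note: nv1 above equals nv-1 when nv % 4 == 1 because the switch handles a
   single leftover vector with a direct hipblasXdot() that writes straight into z; only
   the kernel-launched cases need scratch space, so scratch for nv1 dot products suffices
   and the host summation loop deliberately stops at j < nv1.
*/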

#undef MDOT_WORKGROUP_SIZE
#undef MDOT_WORKGROUP_NUM

PetscErrorCode VecSet_SeqHIP(Vec xin,PetscScalar alpha)
{
  PetscInt                        n = xin->map->n;
  PetscScalar                     *xarray = NULL;
  thrust::device_ptr<PetscScalar> xptr;

  VecHIPGetArrayWrite(xin,&xarray);
  PetscLogGpuTimeBegin();
  if (alpha == (PetscScalar)0.0) {
    hipMemset(xarray,0,n*sizeof(PetscScalar));
  } else {
    try {
      xptr = thrust::device_pointer_cast(xarray);
      thrust::fill(xptr,xptr+n,alpha);
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayWrite(xin,&xarray);
  return 0;
}

struct PetscScalarReciprocal
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& s)
  {
#if defined(PETSC_USE_COMPLEX)
    /* SPOCK compilation issue: the division must be unrolled by hand */
    PetscReal sx = PetscRealPart(s);
    PetscReal sy = PetscImaginaryPart(s);
    PetscReal n  = sx*sx + sy*sy;
    return n != 0.0 ? PetscComplex(sx/n,-sy/n) : 0.0;
#else
    return (s != (PetscScalar)0.0) ? (PetscScalar)1.0/s : 0.0;
#endif
  }
};
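
/* Editor's note: the complex branch above expands 1/(a+bi) = (a - bi)/(a*a + b*b) by hand
   (the same SPOCK workaround as at the top of the file). Both branches map zero entries
   to zero rather than Inf/NaN, matching the CPU default for VecReciprocal(), which leaves
   zero entries unchanged. */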

PetscErrorCode VecReciprocal_SeqHIP(Vec v)
{
  PetscInt    n;
  PetscScalar *x;

  VecGetLocalSize(v,&n);
  VecHIPGetArray(v,&x);
  PetscLogGpuTimeBegin();
  try {
    auto xptr = thrust::device_pointer_cast(x);
    thrust::transform(xptr,xptr+n,xptr,PetscScalarReciprocal());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(v,&x);
  return 0;
}

PetscErrorCode VecScale_SeqHIP(Vec xin,PetscScalar alpha)
{
  PetscScalar     *xarray;
  PetscBLASInt    one = 1,bn = 0;
  hipblasHandle_t hipblasv2handle;

  if (alpha == (PetscScalar)0.0) {
    VecSet_SeqHIP(xin,alpha);
  } else if (alpha != (PetscScalar)1.0) {
    PetscHIPBLASGetHandle(&hipblasv2handle);
    PetscBLASIntCast(xin->map->n,&bn);
    VecHIPGetArray(xin,&xarray);
    PetscLogGpuTimeBegin();
    hipblasXscal(hipblasv2handle,bn,&alpha,xarray,one);
    VecHIPRestoreArray(xin,&xarray);
    PetscLogGpuTimeEnd();
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
    PetscLogGpuFlops(xin->map->n);
  }
  return 0;
}

PetscErrorCode VecTDot_SeqHIP(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *xarray,*yarray;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(xin->map->n,&bn);
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArrayRead(yin,&yarray);
  PetscLogGpuTimeBegin();
  hipblasXdotu(hipblasv2handle,bn,xarray,one,yarray,one,z);
  PetscLogGpuTimeEnd();
  if (xin->map->n > 0) {
    PetscLogGpuFlops(2.0*xin->map->n-1);
  }
  PetscLogGpuToCpu(sizeof(PetscScalar));
  VecHIPRestoreArrayRead(yin,&yarray);
  VecHIPRestoreArrayRead(xin,&xarray);
  return 0;
}

PetscErrorCode VecCopy_SeqHIP(Vec xin,Vec yin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;

  if (xin != yin) {
    if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
      PetscBool yiship;

      PetscObjectTypeCompareAny((PetscObject)yin,&yiship,VECSEQHIP,VECMPIHIP,"");
      VecHIPGetArrayRead(xin,&xarray);
      if (yiship) {
        VecHIPGetArrayWrite(yin,&yarray);
      } else {
        VecGetArrayWrite(yin,&yarray);
      }
      PetscLogGpuTimeBegin();
      if (yiship) {
        hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);
      } else {
        hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToHost);
      }
      PetscLogGpuTimeEnd();
      VecHIPRestoreArrayRead(xin,&xarray);
      if (yiship) {
        VecHIPRestoreArrayWrite(yin,&yarray);
      } else {
        VecRestoreArrayWrite(yin,&yarray);
      }
    } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
      /* copy in CPU if we are on the CPU */
      VecCopy_SeqHIP_Private(xin,yin);
    } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
      /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
      if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
        /* copy in CPU */
        VecCopy_SeqHIP_Private(xin,yin);
      } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
        /* copy in GPU */
        VecHIPGetArrayRead(xin,&xarray);
        VecHIPGetArrayWrite(yin,&yarray);
        PetscLogGpuTimeBegin();
        hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);
        PetscLogGpuTimeEnd();
        VecHIPRestoreArrayRead(xin,&xarray);
        VecHIPRestoreArrayWrite(yin,&yarray);
      } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
        /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
           default to a copy in GPU (this is an arbitrary choice) */
        VecHIPGetArrayRead(xin,&xarray);
        VecHIPGetArrayWrite(yin,&yarray);
        PetscLogGpuTimeBegin();
        hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);
        PetscLogGpuTimeEnd();
        VecHIPRestoreArrayRead(xin,&xarray);
        VecHIPRestoreArrayWrite(yin,&yarray);
      } else {
        VecCopy_SeqHIP_Private(xin,yin);
      }
    }
  }
  return 0;
}
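
/*
   Editor's note: summary of the copy policy implemented above, keyed on the offload
   masks (x = source, y = destination):

     x on GPU              -> copy on the GPU (device-to-host when y is not a HIP vector)
     x on CPU              -> copy on the CPU
     x BOTH, y on CPU      -> copy on the CPU (keep y where it will likely be used next)
     x BOTH, y on GPU      -> copy on the GPU
     x BOTH, y BOTH        -> copy on the GPU (arbitrary choice)
     x BOTH, y unallocated -> copy on the CPU (fallback)
*/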

PetscErrorCode VecSwap_SeqHIP(Vec xin,Vec yin)
{
  PetscBLASInt    one = 1,bn;
  PetscScalar     *xarray,*yarray;
  hipblasHandle_t hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(xin->map->n,&bn);
  if (xin != yin) {
    VecHIPGetArray(xin,&xarray);
    VecHIPGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    hipblasXswap(hipblasv2handle,bn,xarray,one,yarray,one);
    PetscLogGpuTimeEnd();
    VecHIPRestoreArray(xin,&xarray);
    VecHIPRestoreArray(yin,&yarray);
  }
  return 0;
}

PetscErrorCode VecAXPBY_SeqHIP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  PetscScalar       a = alpha,b = beta;
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one = 1, bn = 0;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(yin->map->n,&bn);
  if (a == (PetscScalar)0.0) {
    VecScale_SeqHIP(yin,beta);
  } else if (b == (PetscScalar)1.0) {
    VecAXPY_SeqHIP(yin,alpha,xin);
  } else if (a == (PetscScalar)1.0) {
    VecAYPX_SeqHIP(yin,beta,xin);
  } else if (b == (PetscScalar)0.0) {
    VecHIPGetArrayRead(xin,&xarray);
    VecHIPGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);
    hipblasXscal(hipblasv2handle,bn,&alpha,yarray,one);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(xin->map->n);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
    VecHIPRestoreArrayRead(xin,&xarray);
    VecHIPRestoreArray(yin,&yarray);
  } else {
    VecHIPGetArrayRead(xin,&xarray);
    VecHIPGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    hipblasXscal(hipblasv2handle,bn,&beta,yarray,one);
    hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,yarray,one);
    VecHIPRestoreArrayRead(xin,&xarray);
    VecHIPRestoreArray(yin,&yarray);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(3.0*xin->map->n);
    PetscLogCpuToGpuScalar(2*sizeof(PetscScalar));
  }
  return 0;
}

PetscErrorCode VecAXPBYPCZ_SeqHIP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
{
  PetscInt n = zin->map->n;

  if (gamma == (PetscScalar)1.0) {
    /* z = a*x + b*y + z */
    VecAXPY_SeqHIP(zin,alpha,xin);
    VecAXPY_SeqHIP(zin,beta,yin);
    PetscLogGpuFlops(4.0*n);
  } else {
    /* z = a*x + b*y + c*z */
    VecScale_SeqHIP(zin,gamma);
    VecAXPY_SeqHIP(zin,alpha,xin);
    VecAXPY_SeqHIP(zin,beta,yin);
    PetscLogGpuFlops(5.0*n);
  }
  return 0;
}
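
/* Editor's note: VecAXPBYPCZ_SeqHIP() above is implemented by composition rather than a
   fused kernel: z = alpha*x + beta*y + gamma*z is decomposed as z <- gamma*z (skipped
   when gamma == 1), then z <- z + alpha*x, then z <- z + beta*y. */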

PetscErrorCode VecPointwiseMult_SeqHIP(Vec win,Vec xin,Vec yin)
{
  PetscInt                              n = win->map->n;
  const PetscScalar                     *xarray,*yarray;
  PetscScalar                           *warray;
  thrust::device_ptr<const PetscScalar> xptr,yptr;
  thrust::device_ptr<PetscScalar>       wptr;

  if (xin->boundtocpu || yin->boundtocpu) {
    VecPointwiseMult_Seq(win,xin,yin);
    return 0;
  }
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArrayRead(yin,&yarray);
  VecHIPGetArrayWrite(win,&warray);
  PetscLogGpuTimeBegin();
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
#if defined(PETSC_USE_COMPLEX)
    thrust::transform(xptr,xptr+n,yptr,wptr,PetscMultiplyComplex());
#else
    thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
#endif
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(xin,&xarray);
  VecHIPRestoreArrayRead(yin,&yarray);
  VecHIPRestoreArrayWrite(win,&warray);
  PetscLogGpuFlops(n);
  return 0;
}

/* TODO: the infinity norm should be computed entirely in HIP; currently hipblasIXamax() only locates the largest entry, which is then copied back to the host */

PetscErrorCode VecNorm_SeqHIP(Vec xin,NormType type,PetscReal *z)
{
  PetscInt          n = xin->map->n;
  PetscBLASInt      one = 1, bn = 0;
  const PetscScalar *xarray;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(n,&bn);
  if (type == NORM_2 || type == NORM_FROBENIUS) {
    VecHIPGetArrayRead(xin,&xarray);
    PetscLogGpuTimeBegin();
    hipblasXnrm2(hipblasv2handle,bn,xarray,one,z);
    PetscLogGpuTimeEnd();
    VecHIPRestoreArrayRead(xin,&xarray);
    PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
  } else if (type == NORM_INFINITY) {
    int i;
    VecHIPGetArrayRead(xin,&xarray);
    PetscLogGpuTimeBegin();
    hipblasIXamax(hipblasv2handle,bn,xarray,one,&i);
    PetscLogGpuTimeEnd();
    if (bn) {
      PetscScalar zs;
      hipMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),hipMemcpyDeviceToHost); /* amax indices are 1-based */
      *z = PetscAbsScalar(zs);
    } else *z = 0.0;
    VecHIPRestoreArrayRead(xin,&xarray);
  } else if (type == NORM_1) {
    VecHIPGetArrayRead(xin,&xarray);
    PetscLogGpuTimeBegin();
    hipblasXasum(hipblasv2handle,bn,xarray,one,z);
    VecHIPRestoreArrayRead(xin,&xarray);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(PetscMax(n-1.0,0.0));
  } else if (type == NORM_1_AND_2) {
    VecNorm_SeqHIP(xin,NORM_1,z);
    VecNorm_SeqHIP(xin,NORM_2,z+1);
  }
  PetscLogGpuToCpu(sizeof(PetscReal));
  return 0;
}

PetscErrorCode VecDotNorm2_SeqHIP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
{
  VecDot_SeqHIP(s,t,dp);
  VecDot_SeqHIP(t,t,nm);
  return 0;
}
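
/* Editor's note: following PETSc's VecDot convention (the second argument is conjugated),
   the two calls above return dp = sum_i s_i * conj(t_i) and
   nm = sum_i t_i * conj(t_i) = ||t||^2. */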

PetscErrorCode VecDestroy_SeqHIP(Vec v)
{
  if (v->spptr) {
    if (((Vec_HIP*)v->spptr)->GPUarray_allocated) {
      hipFree(((Vec_HIP*)v->spptr)->GPUarray_allocated);
      ((Vec_HIP*)v->spptr)->GPUarray_allocated = NULL;
    }
    if (((Vec_HIP*)v->spptr)->stream) {
      hipStreamDestroy(((Vec_HIP*)v->spptr)->stream);
    }
  }
  VecDestroy_SeqHIP_Private(v);
  PetscFree(v->spptr);
  return 0;
}

#if defined(PETSC_USE_COMPLEX)
/* SPOCK compilation issue: the conjugation must be done by hand */
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& x)
  {
    return PetscScalar(PetscRealPart(x),-PetscImaginaryPart(x));
  }
};
#endif

PetscErrorCode VecConjugate_SeqHIP(Vec xin)
{
#if defined(PETSC_USE_COMPLEX)
  PetscScalar                     *xarray;
  PetscInt                        n = xin->map->n;
  thrust::device_ptr<PetscScalar> xptr;

  VecHIPGetArray(xin,&xarray);
  PetscLogGpuTimeBegin();
  try {
    xptr = thrust::device_pointer_cast(xarray);
    thrust::transform(xptr,xptr+n,xptr,conjugate());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(xin,&xarray);
#endif
  return 0;
}

static inline PetscErrorCode VecGetLocalVectorK_SeqHIP(Vec v,Vec w,PetscBool read)
{
  PetscBool wisseqhip;

  PetscObjectTypeCompare((PetscObject)w,VECSEQHIP,&wisseqhip);
  if (w->data && wisseqhip) {
    if (((Vec_Seq*)w->data)->array_allocated) {
      if (w->pinned_memory) {
        PetscMallocSetHIPHost();
      }
      PetscFree(((Vec_Seq*)w->data)->array_allocated);
      if (w->pinned_memory) {
        PetscMallocResetHIPHost();
        w->pinned_memory = PETSC_FALSE;
      }
    }
    ((Vec_Seq*)w->data)->array         = NULL;
    ((Vec_Seq*)w->data)->unplacedarray = NULL;
  }
  if (w->spptr && wisseqhip) {
    if (((Vec_HIP*)w->spptr)->GPUarray) {
      hipFree(((Vec_HIP*)w->spptr)->GPUarray);
      ((Vec_HIP*)w->spptr)->GPUarray = NULL;
    }
    if (((Vec_HIP*)w->spptr)->stream) {
      hipStreamDestroy(((Vec_HIP*)w->spptr)->stream);
    }
    PetscFree(w->spptr);
  }

  if (v->petscnative && wisseqhip) {
    PetscFree(w->data);
    w->data          = v->data;
    w->offloadmask   = v->offloadmask;
    w->pinned_memory = v->pinned_memory;
    w->spptr         = v->spptr;
    PetscObjectStateIncrease((PetscObject)w);
  } else {
    if (read) {
      VecGetArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
    } else {
      VecGetArray(v,&((Vec_Seq*)w->data)->array);
    }
    w->offloadmask = PETSC_OFFLOAD_CPU;
    if (wisseqhip) {
      VecHIPAllocateCheck(w);
    }
  }
  return 0;
}

static inline PetscErrorCode VecRestoreLocalVectorK_SeqHIP(Vec v,Vec w,PetscBool read)
{
  PetscBool wisseqhip;

  PetscObjectTypeCompare((PetscObject)w,VECSEQHIP,&wisseqhip);
  if (v->petscnative && wisseqhip) {
    v->data          = w->data;
    v->offloadmask   = w->offloadmask;
    v->pinned_memory = w->pinned_memory;
    v->spptr         = w->spptr;
    w->data          = NULL;
    w->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
    w->spptr         = NULL;
  } else {
    if (read) {
      VecRestoreArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
    } else {
      VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
    }
    if (w->spptr && wisseqhip) {
      hipFree(((Vec_HIP*)w->spptr)->GPUarray);
      ((Vec_HIP*)w->spptr)->GPUarray = NULL;
      if (((Vec_HIP*)w->spptr)->stream) {
        hipStreamDestroy(((Vec_HIP*)w->spptr)->stream);
      }
      PetscFree(w->spptr);
    }
  }
  return 0;
}

PetscErrorCode VecGetLocalVector_SeqHIP(Vec v,Vec w)
{
  VecGetLocalVectorK_SeqHIP(v,w,PETSC_FALSE);
  return 0;
}

PetscErrorCode VecGetLocalVectorRead_SeqHIP(Vec v,Vec w)
{
  VecGetLocalVectorK_SeqHIP(v,w,PETSC_TRUE);
  return 0;
}

PetscErrorCode VecRestoreLocalVector_SeqHIP(Vec v,Vec w)
{
  VecRestoreLocalVectorK_SeqHIP(v,w,PETSC_FALSE);
  return 0;
}

PetscErrorCode VecRestoreLocalVectorRead_SeqHIP(Vec v,Vec w)
{
  VecRestoreLocalVectorK_SeqHIP(v,w,PETSC_TRUE);
  return 0;
}

struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
{
  __host__ __device__
  PetscReal operator()(const PetscScalar& x) {
    return PetscRealPart(x);
  }
};

struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
{
  __host__ __device__
  thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt>& x) {
    return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
  }
};

struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
{
  __host__ __device__
  PetscReal operator()(const PetscReal& x, const PetscReal& y) {
    return x < y ? y : x;
  }
};

struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
{
  __host__ __device__
  thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
    return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
           (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
           (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
  }
};

struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
{
  __host__ __device__
  PetscReal operator()(const PetscReal& x, const PetscReal& y) {
    return x < y ? x : y;
  }
};

struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
{
  __host__ __device__
  thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
    return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
           (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
           (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
  }
};
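
/*
   Editor's note: the (value,index) reduction functors above break ties deterministically:
   when two entries compare equal, the smaller index wins, so VecMax/VecMin below return
   the first occurrence of the extremum regardless of reduction order. For complex builds,
   petscrealparti() first projects each entry to its real part, matching PETSc's
   convention that the max/min of a complex vector are taken over real parts.
*/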

PetscErrorCode VecMax_SeqHIP(Vec v, PetscInt *p, PetscReal *m)
{
  PetscInt                              n = v->map->n;
  const PetscScalar                     *av;
  thrust::device_ptr<const PetscScalar> avpt;

  if (!n) {
    *m = PETSC_MIN_REAL;
    if (p) *p = -1;
    return 0;
  }
  VecHIPGetArrayRead(v,&av);
  avpt = thrust::device_pointer_cast(av);
  PetscLogGpuTimeBegin();
  if (p) {
    thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
    try {
#if defined(PETSC_USE_COMPLEX)
      res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmaxi());
#else
      res = thrust::reduce(zibit,zibit+n,res,petscmaxi());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
    *m = res.get<0>();
    *p = res.get<1>();
  } else {
    try {
#if defined(PETSC_USE_COMPLEX)
      *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
#else
      *m = thrust::reduce(avpt,avpt+n,PETSC_MIN_REAL,petscmax());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(v,&av);
  return 0;
}

PetscErrorCode VecMin_SeqHIP(Vec v, PetscInt *p, PetscReal *m)
{
  PetscInt                              n = v->map->n;
  const PetscScalar                     *av;
  thrust::device_ptr<const PetscScalar> avpt;

  if (!n) {
    *m = PETSC_MAX_REAL;
    if (p) *p = -1;
    return 0;
  }
  VecHIPGetArrayRead(v,&av);
  avpt = thrust::device_pointer_cast(av);
  PetscLogGpuTimeBegin();
  if (p) {
    thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
    try {
#if defined(PETSC_USE_COMPLEX)
      res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmini());
#else
      res = thrust::reduce(zibit,zibit+n,res,petscmini());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
    *m = res.get<0>();
    *p = res.get<1>();
  } else {
    try {
#if defined(PETSC_USE_COMPLEX)
      *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
#else
      *m = thrust::reduce(avpt,avpt+n,PETSC_MAX_REAL,petscmin());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(v,&av);
  return 0;
}

PetscErrorCode VecSum_SeqHIP(Vec v,PetscScalar *sum)
{
  PetscInt                              n = v->map->n;
  const PetscScalar                     *a;
  thrust::device_ptr<const PetscScalar> dptr;

  VecHIPGetArrayRead(v,&a);
  dptr = thrust::device_pointer_cast(a);
  PetscLogGpuTimeBegin();
  try {
    *sum = thrust::reduce(dptr,dptr+n,PetscScalar(0.0));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(v,&a);
  return 0;
}

struct petscshift : public thrust::unary_function<PetscScalar,PetscScalar>
{
  const PetscScalar shift_;
  petscshift(PetscScalar shift) : shift_(shift) {}
  __host__ __device__
  PetscScalar operator()(PetscScalar x) { return x + shift_; }
};

PetscErrorCode VecShift_SeqHIP(Vec v,PetscScalar shift)
{
  PetscInt                        n = v->map->n;
  PetscScalar                     *a;
  thrust::device_ptr<PetscScalar> dptr;

  VecHIPGetArray(v,&a);
  dptr = thrust::device_pointer_cast(a);
  PetscLogGpuTimeBegin();
  try {
    thrust::transform(dptr,dptr+n,dptr,petscshift(shift)); /* in-place transform */
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(v,&a);
  return 0;
}